x264 source for verification 2026-05-22
This commit is contained in:
56
common/aarch64/asm-offsets.c
Normal file
56
common/aarch64/asm-offsets.c
Normal file
@@ -0,0 +1,56 @@
|
||||
/*****************************************************************************
|
||||
* asm-offsets.c: check asm offsets for aarch64
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2014-2025 x264 project
|
||||
*
|
||||
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "asm-offsets.h"
|
||||
|
||||
/* Compile-time assertion: declares an int array named assert_<name> whose
 * size is 1 when `x` is true and -1 (rejected by the compiler) when `x` is
 * false. */
#define STATIC_ASSERT(name, x) int assert_##name[(x) ? 1 : -1]
|
||||
|
||||
// Compile-time check that member `m` of type `s` sits at byte offset `o`.
// Declares struct check_<s>_<m> whose only member is the STATIC_ASSERT array,
// so a mismatch gives that array a negative size and the build fails.
#define X264_CHECK_OFFSET(s, m, o) struct check_##s##_##m \
|
||||
{ \
|
||||
STATIC_ASSERT(offset_##m, offsetof(s, m) == o); \
|
||||
}
|
||||
|
||||
// Compile-time check that member `b` of type `s` starts exactly sizeof(type)
// bytes after the start of member `a`, i.e. that the two are adjacent when
// `a` has type `type`. Declares struct check_<s>_<a>_<b>.
#define X264_CHECK_REL_OFFSET(s, a, type, b) struct check_##s##_##a##_##b \
|
||||
{ \
|
||||
STATIC_ASSERT(rel_offset_##a##_##b, offsetof(s, a) + sizeof(type) == offsetof(s, b)); \
|
||||
}
|
||||
|
||||
|
||||
// Each CABAC_* constant from asm-offsets.h must equal the real offset of the
// matching x264_cabac_t member; any mismatch is a compile error.
X264_CHECK_OFFSET(x264_cabac_t, i_low, CABAC_I_LOW);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, i_range, CABAC_I_RANGE);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, i_queue, CABAC_I_QUEUE);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, i_bytes_outstanding, CABAC_I_BYTES_OUTSTANDING);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, p_start, CABAC_P_START);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, p, CABAC_P);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, p_end, CABAC_P_END);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, f8_bits_encoded, CABAC_F8_BITS_ENCODED);
|
||||
X264_CHECK_OFFSET(x264_cabac_t, state, CABAC_STATE);
|
||||
|
||||
// the aarch64 asm makes the following additional assumptions about the x264_cabac_t
|
||||
// memory layout
|
||||
|
||||
X264_CHECK_REL_OFFSET(x264_cabac_t, i_low, int, i_range);
|
||||
X264_CHECK_REL_OFFSET(x264_cabac_t, i_queue, int, i_bytes_outstanding);
|
||||
39
common/aarch64/asm-offsets.h
Normal file
39
common/aarch64/asm-offsets.h
Normal file
@@ -0,0 +1,39 @@
|
||||
/*****************************************************************************
|
||||
* asm-offsets.h: asm offsets for aarch64
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2014-2025 x264 project
|
||||
*
|
||||
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_ASM_OFFSETS_H
|
||||
#define X264_AARCH64_ASM_OFFSETS_H
|
||||
|
||||
/* Byte offsets of x264_cabac_t members, for use as immediates in the aarch64
 * assembly. asm-offsets.c checks every value against offsetof() at compile
 * time, so these must be updated together with the struct layout. */
#define CABAC_I_LOW 0x00
|
||||
#define CABAC_I_RANGE 0x04
|
||||
#define CABAC_I_QUEUE 0x08
|
||||
#define CABAC_I_BYTES_OUTSTANDING 0x0c
|
||||
#define CABAC_P_START 0x10
|
||||
#define CABAC_P 0x18
|
||||
#define CABAC_P_END 0x20
|
||||
#define CABAC_F8_BITS_ENCODED 0x30
|
||||
#define CABAC_STATE 0x34
|
||||
|
||||
#endif
|
||||
291
common/aarch64/asm.S
Normal file
291
common/aarch64/asm.S
Normal file
@@ -0,0 +1,291 @@
|
||||
/*****************************************************************************
|
||||
* asm.S: AArch64 utility macros
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2008-2025 x264 project
|
||||
*
|
||||
* Authors: Mans Rullgard <mans@mansr.com>
|
||||
* David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
// Two-level token pasting: JOIN lets its arguments be macro-expanded before
// GLUE pastes them together.
#define GLUE(a, b) a ## b
|
||||
#define JOIN(a, b) GLUE(a, b)
|
||||
|
||||
// With PREFIX defined, every symbol name gets a leading underscore.
#ifdef PREFIX
|
||||
# define BASE _x264_
|
||||
# define SYM_PREFIX _
|
||||
#else
|
||||
# define BASE x264_
|
||||
# define SYM_PREFIX
|
||||
#endif
|
||||
|
||||
// EXTERN_ASM is the prefix the `function` macro puts on exported labels.
// When BIT_DEPTH is defined it is BASE, BIT_DEPTH and a trailing underscore
// pasted together (e.g. x264_8_ for BIT_DEPTH 8 without PREFIX).
#ifdef BIT_DEPTH
|
||||
# define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
|
||||
#else
|
||||
# define EXTERN_ASM BASE
|
||||
#endif
|
||||
|
||||
// X(s):    s with the EXTERN_ASM prefix (includes BIT_DEPTH when defined).
// X264(s): s with the BASE prefix only.
// EXT(s):  s with only SYM_PREFIX.
#define X(s) JOIN(EXTERN_ASM, s)
|
||||
#define X264(s) JOIN(BASE, s)
|
||||
#define EXT(s) JOIN(SYM_PREFIX, s)
|
||||
|
||||
// Line-prefix switches. Each expands to nothing when its condition holds, so
// the directive written after it is assembled as-is, and to `#` otherwise --
// presumably so the assembler treats the rest of that line as a comment
// (NOTE(review): confirm for every supported assembler).
// ELF: lines that only apply to ELF targets (.size, .type, .section .rodata).
#ifdef __ELF__
|
||||
# define ELF
|
||||
#else
|
||||
# define ELF #
|
||||
#endif
|
||||
|
||||
// MACH: lines that only apply when __MACH__ is defined (.const_data).
#ifdef __MACH__
|
||||
# define MACH
|
||||
#else
|
||||
# define MACH #
|
||||
#endif
|
||||
|
||||
// FUNC: gates the .func/.endfunc directives emitted by the `function` macro;
// enabled when HAVE_AS_FUNC is non-zero.
#if HAVE_AS_FUNC
|
||||
# define FUNC
|
||||
#else
|
||||
# define FUNC #
|
||||
#endif
|
||||
|
||||
// Select the baseline architecture level; AS_ARCH_LEVEL is not defined in this
// file (presumably supplied by config.h -- confirm).
.arch AS_ARCH_LEVEL
|
||||
// For each optional extension, ENABLE_x / DISABLE_x expand to the matching
// .arch_extension directive when the corresponding
// HAVE_AS_ARCHEXT_*_DIRECTIVE is non-zero, and to nothing otherwise.
#if HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE
|
||||
#define ENABLE_DOTPROD .arch_extension dotprod
|
||||
#define DISABLE_DOTPROD .arch_extension nodotprod
|
||||
#else
|
||||
#define ENABLE_DOTPROD
|
||||
#define DISABLE_DOTPROD
|
||||
#endif
|
||||
#if HAVE_AS_ARCHEXT_I8MM_DIRECTIVE
|
||||
#define ENABLE_I8MM .arch_extension i8mm
|
||||
#define DISABLE_I8MM .arch_extension noi8mm
|
||||
#else
|
||||
#define ENABLE_I8MM
|
||||
#define DISABLE_I8MM
|
||||
#endif
|
||||
#if HAVE_AS_ARCHEXT_SVE_DIRECTIVE
|
||||
#define ENABLE_SVE .arch_extension sve
|
||||
#define DISABLE_SVE .arch_extension nosve
|
||||
#else
|
||||
#define ENABLE_SVE
|
||||
#define DISABLE_SVE
|
||||
#endif
|
||||
#if HAVE_AS_ARCHEXT_SVE2_DIRECTIVE
|
||||
#define ENABLE_SVE2 .arch_extension sve2
|
||||
#define DISABLE_SVE2 .arch_extension nosve2
|
||||
#else
|
||||
#define ENABLE_SVE2
|
||||
#define DISABLE_SVE2
|
||||
#endif
|
||||
|
||||
/* If we do support the .arch_extension directives, disable support for all
|
||||
* the extensions that we may use, in case they were implicitly enabled by
|
||||
* the .arch level. This makes it clear if we try to assemble an instruction
|
||||
* from an unintended extension set; we only allow assembling such instructions
|
||||
* within regions where we explicitly enable those extensions. */
|
||||
DISABLE_DOTPROD
|
||||
DISABLE_I8MM
|
||||
DISABLE_SVE
|
||||
DISABLE_SVE2
|
||||
|
||||
// function name, export=0, align=2
// Opens a function: switches to .text, applies `.align \align` and emits the
// entry label. With export non-zero the label is EXTERN_ASM-prefixed and made
// .global; otherwise the bare \name is used and no .global is emitted.
// Also defines a one-shot `endfunc` macro that emits the ELF .size and the
// FUNC .endfunc for this function and then purges itself, so every `function`
// must be closed by exactly one `endfunc` before the next `function`.
.macro function name, export=0, align=2
|
||||
.macro endfunc
|
||||
.if \export
|
||||
ELF .size EXTERN_ASM\name, . - EXTERN_ASM\name
|
||||
.else
|
||||
ELF .size \name, . - \name
|
||||
.endif
|
||||
FUNC .endfunc
|
||||
// remove this definition so the next `function` can define endfunc again
.purgem endfunc
|
||||
.endm
|
||||
.text
|
||||
.align \align
|
||||
.if \export
|
||||
.global EXTERN_ASM\name
|
||||
ELF .type EXTERN_ASM\name, %function
|
||||
FUNC .func EXTERN_ASM\name
|
||||
EXTERN_ASM\name:
|
||||
.else
|
||||
ELF .type \name, %function
|
||||
FUNC .func \name
|
||||
\name:
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// const name, align=2
// Opens a constant data object: selects .rodata (ELF-prefixed line) or
// .const_data (MACH-prefixed line), applies `.align \align` and emits the
// label \name. Defines a one-shot `endconst` macro that emits the ELF .size
// for the object and then purges itself.
.macro const name, align=2
|
||||
.macro endconst
|
||||
ELF .size \name, . - \name
|
||||
.purgem endconst
|
||||
.endm
|
||||
ELF .section .rodata
|
||||
MACH .const_data
|
||||
.align \align
|
||||
\name:
|
||||
.endm
|
||||
|
||||
// movrel rd, val, offset=0
// Puts the address of symbol \val plus the constant \offset into \rd.
// The instruction sequence depends on the target:
//   __APPLE__        adrp/add with @PAGE / @PAGEOFF
//   PIC && _WIN32    adrp/add with :lo12:
//   PIC (other)      adrp/add with :lo12:, offset always folded in
//   otherwise        ldr \rd, =expr pseudo-instruction
// In the first two variants a negative \offset is not folded into the symbol
// expression; the address of \val is formed first and the offset is applied
// with a separate sub (presumably because those relocations cannot carry a
// negative addend -- confirm).
.macro movrel rd, val, offset=0
|
||||
#if defined(__APPLE__)
|
||||
.if \offset < 0
|
||||
adrp \rd, \val@PAGE
|
||||
add \rd, \rd, \val@PAGEOFF
|
||||
// subtracting -(\offset) adds the (negative) offset
sub \rd, \rd, -(\offset)
|
||||
.else
|
||||
adrp \rd, \val+(\offset)@PAGE
|
||||
add \rd, \rd, \val+(\offset)@PAGEOFF
|
||||
.endif
|
||||
#elif defined(PIC) && defined(_WIN32)
|
||||
.if \offset < 0
|
||||
adrp \rd, \val
|
||||
add \rd, \rd, :lo12:\val
|
||||
sub \rd, \rd, -(\offset)
|
||||
.else
|
||||
adrp \rd, \val+(\offset)
|
||||
add \rd, \rd, :lo12:\val+(\offset)
|
||||
.endif
|
||||
#elif defined(PIC)
|
||||
adrp \rd, \val+(\offset)
|
||||
add \rd, \rd, :lo12:\val+(\offset)
|
||||
#else
|
||||
ldr \rd, =\val+\offset
|
||||
#endif
|
||||
.endm
|
||||
|
||||
#define FDEC_STRIDE 32
|
||||
#define FENC_STRIDE 16
|
||||
|
||||
|
||||
// SUMSUB_AB sum, sub, a, b
// \sum = \a + \b, then \sub = \a - \b.
// \sum is written before \a and \b are read for the subtraction, so \sum must
// not be the same register as \a or \b; \sub may alias either input.
.macro SUMSUB_AB sum, sub, a, b
|
||||
add \sum, \a, \b
|
||||
sub \sub, \a, \b
|
||||
.endm
|
||||
|
||||
// unzip t1, t2, s1, s2
// De-interleaves \s1:\s2 with uzp1/uzp2: \t1 gets the even-indexed elements,
// \t2 the odd-indexed ones. \t1 is written first, so it must not alias \s1
// or \s2.
.macro unzip t1, t2, s1, s2
|
||||
uzp1 \t1, \s1, \s2
|
||||
uzp2 \t2, \s1, \s2
|
||||
.endm
|
||||
|
||||
// transpose t1, t2, s1, s2
// One trn1/trn2 step: \t1 interleaves the even-indexed elements of \s1 and
// \s2, \t2 the odd-indexed ones. \t1 is written first, so it must not alias
// \s1 or \s2.
.macro transpose t1, t2, s1, s2
|
||||
trn1 \t1, \s1, \s2
|
||||
trn2 \t2, \s1, \s2
|
||||
.endm
|
||||
|
||||
// transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
// Transposes a 4x4 matrix of 16-bit elements whose rows are in the low 64 bits
// of \v0..\v3; the result is left in \v0..\v3. Arguments are bare register
// names (e.g. v0) because the macro appends the arrangement itself.
// \t0..\t3 are scratch and are overwritten.
.macro transpose4x4.h v0, v1, v2, v3, t0, t1, t2, t3
|
||||
// swap 32-bit pairs between rows 0/2 and 1/3
transpose \t0\().2s, \t2\().2s, \v0\().2s, \v2\().2s
|
||||
transpose \t1\().2s, \t3\().2s, \v1\().2s, \v3\().2s
|
||||
// swap 16-bit elements between adjacent rows
transpose \v0\().4h, \v1\().4h, \t0\().4h, \t1\().4h
|
||||
transpose \v2\().4h, \v3\().4h, \t2\().4h, \t3\().4h
|
||||
.endm
|
||||
|
||||
// transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
// Same sequence as transpose4x4.h on full 128-bit registers: the 4x4 block of
// 16-bit elements in the low halves of \v0..\v3 and the 4x4 block in the high
// halves are each transposed independently. Result in \v0..\v3; \t0..\t3 are
// scratch and are overwritten.
.macro transpose4x8.h v0, v1, v2, v3, t0, t1, t2, t3
|
||||
transpose \t0\().4s, \t2\().4s, \v0\().4s, \v2\().4s
|
||||
transpose \t1\().4s, \t3\().4s, \v1\().4s, \v3\().4s
|
||||
transpose \v0\().8h, \v1\().8h, \t0\().8h, \t1\().8h
|
||||
transpose \v2\().8h, \v3\().8h, \t2\().8h, \t3\().8h
|
||||
.endm
|
||||
|
||||
|
||||
// transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
// Transposes an 8x8 matrix of 16-bit elements held in \r0..\r7 (one row per
// register) in place; afterwards \r0..\r7 hold columns 0..7. \r8 and \r9 are
// scratch and are overwritten. Three trn1/trn2 passes at 16-, 32- and 64-bit
// granularity; registers are reused between passes, so statement order matters.
.macro transpose8x8.h r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
// pass 1: exchange 16-bit elements between row pairs (0,1) (2,3) (4,5) (6,7)
trn1 \r8\().8h, \r0\().8h, \r1\().8h
|
||||
trn2 \r9\().8h, \r0\().8h, \r1\().8h
|
||||
trn1 \r1\().8h, \r2\().8h, \r3\().8h
|
||||
trn2 \r3\().8h, \r2\().8h, \r3\().8h
|
||||
trn1 \r0\().8h, \r4\().8h, \r5\().8h
|
||||
trn2 \r5\().8h, \r4\().8h, \r5\().8h
|
||||
trn1 \r2\().8h, \r6\().8h, \r7\().8h
|
||||
trn2 \r7\().8h, \r6\().8h, \r7\().8h
|
||||
|
||||
// pass 2: exchange 32-bit pairs
trn1 \r4\().4s, \r0\().4s, \r2\().4s
|
||||
trn2 \r2\().4s, \r0\().4s, \r2\().4s
|
||||
trn1 \r6\().4s, \r5\().4s, \r7\().4s
|
||||
trn2 \r7\().4s, \r5\().4s, \r7\().4s
|
||||
trn1 \r5\().4s, \r9\().4s, \r3\().4s
|
||||
trn2 \r9\().4s, \r9\().4s, \r3\().4s
|
||||
trn1 \r3\().4s, \r8\().4s, \r1\().4s
|
||||
trn2 \r8\().4s, \r8\().4s, \r1\().4s
|
||||
|
||||
// pass 3: exchange 64-bit halves, producing the final rows
trn1 \r0\().2d, \r3\().2d, \r4\().2d
|
||||
trn2 \r4\().2d, \r3\().2d, \r4\().2d
|
||||
|
||||
trn1 \r1\().2d, \r5\().2d, \r6\().2d
|
||||
trn2 \r5\().2d, \r5\().2d, \r6\().2d
|
||||
|
||||
// \r6 is produced first here because the next line overwrites \r2
trn2 \r6\().2d, \r8\().2d, \r2\().2d
|
||||
trn1 \r2\().2d, \r8\().2d, \r2\().2d
|
||||
|
||||
trn1 \r3\().2d, \r9\().2d, \r7\().2d
|
||||
trn2 \r7\().2d, \r9\().2d, \r7\().2d
|
||||
.endm
|
||||
|
||||
// transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
// Same three-pass trn1/trn2 network as transpose8x8.h, one element size down
// (8-, 16- and 32-bit passes): the 8x8 byte block in the low halves of
// \r0..\r7 and the one in the high halves are each transposed independently.
// Result in \r0..\r7; \t0 and \t1 are scratch and are overwritten.
.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
|
||||
trn1 \t0\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \t1\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \r1\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \r3\().16b, \r2\().16b, \r3\().16b
|
||||
trn1 \r0\().16b, \r4\().16b, \r5\().16b
|
||||
trn2 \r5\().16b, \r4\().16b, \r5\().16b
|
||||
trn1 \r2\().16b, \r6\().16b, \r7\().16b
|
||||
trn2 \r7\().16b, \r6\().16b, \r7\().16b
|
||||
|
||||
trn1 \r4\().8h, \r0\().8h, \r2\().8h
|
||||
trn2 \r2\().8h, \r0\().8h, \r2\().8h
|
||||
trn1 \r6\().8h, \r5\().8h, \r7\().8h
|
||||
trn2 \r7\().8h, \r5\().8h, \r7\().8h
|
||||
trn1 \r5\().8h, \t1\().8h, \r3\().8h
|
||||
trn2 \t1\().8h, \t1\().8h, \r3\().8h
|
||||
trn1 \r3\().8h, \t0\().8h, \r1\().8h
|
||||
trn2 \t0\().8h, \t0\().8h, \r1\().8h
|
||||
|
||||
trn1 \r0\().4s, \r3\().4s, \r4\().4s
|
||||
trn2 \r4\().4s, \r3\().4s, \r4\().4s
|
||||
|
||||
trn1 \r1\().4s, \r5\().4s, \r6\().4s
|
||||
trn2 \r5\().4s, \r5\().4s, \r6\().4s
|
||||
|
||||
// \r6 is produced first here because the next line overwrites \r2
trn2 \r6\().4s, \t0\().4s, \r2\().4s
|
||||
trn1 \r2\().4s, \t0\().4s, \r2\().4s
|
||||
|
||||
trn1 \r3\().4s, \t1\().4s, \r7\().4s
|
||||
trn2 \r7\().4s, \t1\().4s, \r7\().4s
|
||||
.endm
|
||||
|
||||
// transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
// Treats \r0..\r3 as four rows of 16 bytes and transposes every 4x4 byte
// block (four blocks, one per 32-bit column group) independently.
// Result in \r0..\r3; \t4..\t7 are scratch and are overwritten.
.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
|
||||
// byte-level exchange between rows (0,1) and (2,3)
trn1 \t4\().16b, \r0\().16b, \r1\().16b
|
||||
trn2 \t5\().16b, \r0\().16b, \r1\().16b
|
||||
trn1 \t6\().16b, \r2\().16b, \r3\().16b
|
||||
trn2 \t7\().16b, \r2\().16b, \r3\().16b
|
||||
|
||||
// 16-bit exchange; note the output order \r0, \r2, \r1, \r3
trn1 \r0\().8h, \t4\().8h, \t6\().8h
|
||||
trn2 \r2\().8h, \t4\().8h, \t6\().8h
|
||||
trn1 \r1\().8h, \t5\().8h, \t7\().8h
|
||||
trn2 \r3\().8h, \t5\().8h, \t7\().8h
|
||||
.endm
|
||||
|
||||
// transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
// 64-bit variant of transpose_4x16.b: \r0..\r3 are four rows of 8 bytes and
// each of the two 4x4 byte blocks is transposed independently.
// Result in \r0..\r3; \t4..\t7 are scratch and are overwritten.
.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
|
||||
trn1 \t4\().8b, \r0\().8b, \r1\().8b
|
||||
trn2 \t5\().8b, \r0\().8b, \r1\().8b
|
||||
trn1 \t6\().8b, \r2\().8b, \r3\().8b
|
||||
trn2 \t7\().8b, \r2\().8b, \r3\().8b
|
||||
|
||||
trn1 \r0\().4h, \t4\().4h, \t6\().4h
|
||||
trn2 \r2\().4h, \t4\().4h, \t6\().4h
|
||||
trn1 \r1\().4h, \t5\().4h, \t7\().4h
|
||||
trn2 \r3\().4h, \t5\().4h, \t7\().4h
|
||||
.endm
|
||||
82
common/aarch64/bitstream-a.S
Normal file
82
common/aarch64/bitstream-a.S
Normal file
@@ -0,0 +1,82 @@
|
||||
/*****************************************************************************
|
||||
* bitstream-a.S: aarch64 bitstream functions
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2014-2025 x264 project
|
||||
*
|
||||
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
// nal_escape_neon: copies bytes from src up to end into dst, inserting a 0x03
// byte before any byte <= 3 that follows two zero bytes in the output stream.
// Going by the C prototype in bitstream.h (dst, src, end) the arguments are
// x0 = dst, x1 = src, x2 = end; the advanced dst is left in x0 on return.
// Register roles:
//   v0  previous 16 bytes (only lanes 14/15, the last two bytes, are used)
//   v4  constant 4 per lane, w3 constant 3 (the escape byte)
//   x6  negative count of bytes still to process
//   w5  scalar history: the most recently written bytes, newest in the low byte
function nal_escape_neon, export=1
|
||||
// history starts as 0xff so the first bytes never look preceded by zeros
movi v0.16b, #0xff
|
||||
movi v4.16b, #4
|
||||
mov w3, #3
|
||||
subs x6, x1, x2
|
||||
cbz x6, 99f
|
||||
0:
|
||||
// x6 + 15 < 0 means at least 16 bytes remain: take the vector path
cmn x6, #15
|
||||
b.lt 16f
|
||||
// tail: fewer than 16 bytes left; index them as [end + x6] in the scalar loop
mov x1, x2
|
||||
b 100f
|
||||
16:
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
// v2/v3: for each lane, the byte two / one position(s) earlier in the stream
ext v2.16b, v0.16b, v1.16b, #14
|
||||
ext v3.16b, v0.16b, v1.16b, #15
|
||||
// v7: lanes whose byte is <= 3 (unsigned 4 > byte)
cmhi v7.16b, v4.16b, v1.16b
|
||||
cmeq v5.16b, v2.16b, #0
|
||||
cmeq v6.16b, v3.16b, #0
|
||||
// v5: lanes that need an escape (two zero bytes before, byte <= 3)
and v5.16b, v5.16b, v7.16b
|
||||
and v5.16b, v5.16b, v6.16b
|
||||
// narrow the 128-bit lane mask to 64 bits so one scalar test covers all lanes
shrn v7.8b, v5.8h, #4
|
||||
mov x7, v7.d[0]
|
||||
// no lane needs escaping: jump to the plain 16-byte copy (second 16: label)
cbz x7, 16f
|
||||
// otherwise redo these 16 bytes one at a time; x1 is already past them
mov x6, #-16
|
||||
100:
|
||||
// seed the scalar history with the last two bytes recorded in v0
umov w5, v0.b[14]
|
||||
umov w4, v0.b[15]
|
||||
orr w5, w4, w5, lsl #8
|
||||
101:
|
||||
ldrb w4, [x1, x6]
|
||||
// w9 = (previous two bytes << 16) | byte; <= 3 only if both were zero and byte <= 3
orr w9, w4, w5, lsl #16
|
||||
cmp w9, #3
|
||||
b.hi 102f
|
||||
// emit the escape byte and shift it into the history
strb w3, [x0], #1
|
||||
orr w5, w3, w5, lsl #8
|
||||
102:
|
||||
// the flags set here are consumed by the b.lt below; strb/orr leave them alone
adds x6, x6, #1
|
||||
strb w4, [x0], #1
|
||||
orr w5, w4, w5, lsl #8
|
||||
b.lt 101b
|
||||
subs x6, x1, x2
|
||||
// write the last two emitted bytes back into v0 lanes 14/15
lsr w9, w5, #8
|
||||
mov v0.b[14], w9
|
||||
mov v0.b[15], w5
|
||||
b.lt 0b
|
||||
|
||||
ret
|
||||
16:
|
||||
// plain copy of the 16 bytes; they become the new history
subs x6, x1, x2
|
||||
st1 {v1.16b}, [x0], #16
|
||||
mov v0.16b, v1.16b
|
||||
b.lt 0b
|
||||
99:
|
||||
ret
|
||||
endfunc
|
||||
32
common/aarch64/bitstream.h
Normal file
32
common/aarch64/bitstream.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*****************************************************************************
|
||||
* bitstream.h: aarch64 bitstream functions
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_BITSTREAM_H
|
||||
#define X264_AARCH64_BITSTREAM_H
|
||||
|
||||
/* x264_template() is defined elsewhere; presumably it maps the name to the
 * bit-depth-specific symbol exported by bitstream-a.S -- confirm. */
#define x264_nal_escape_neon x264_template(nal_escape_neon)
|
||||
/* NEON NAL escaping: processes the bytes from src up to end, writing the
 * escaped stream to dst. The assembly returns its advanced dst pointer. */
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||
|
||||
#endif
|
||||
131
common/aarch64/cabac-a.S
Normal file
131
common/aarch64/cabac-a.S
Normal file
@@ -0,0 +1,131 @@
|
||||
/*****************************************************************************
|
||||
* cabac-a.S: aarch64 cabac
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2014-2025 x264 project
|
||||
*
|
||||
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "asm-offsets.h"
|
||||
|
||||
// w11 holds x264_cabac_t.i_low
|
||||
// w12 holds x264_cabac_t.i_range
|
||||
|
||||
// cabac_encode_decision_asm
// x0 is the x264_cabac_t pointer (every CABAC_* offset is applied to it),
// w1 selects the context (added to CABAC_STATE to address its state byte) and
// w2 is the bin value b.
// The labels cabac_encode_renorm and cabac_putbyte are plain labels inside
// this function; cabac_putbyte is also branched to by the bypass and terminal
// functions. It expects x0 = cabac pointer, w11 = i_low, w12 = i_range and
// w2 = the already-incremented i_queue (>= 0).
function cabac_encode_decision_asm, export=1
|
||||
add w10, w1, #CABAC_STATE
|
||||
ldrb w3, [x0, w10, uxtw] // i_state
|
||||
ldr w12, [x0, #CABAC_I_RANGE]
|
||||
// table base is biased by -4; presumably i_range >> 6 is always 4..7 -- confirm
movrel x8, X264(cabac_range_lps), -4
|
||||
movrel x9, X264(cabac_transition)
|
||||
// x4 = i_state >> 1, w5 = i_range >> 6: byte index (x4 * 4 + w5) into the biased table
ubfx x4, x3, #1, #7
|
||||
asr w5, w12, #6
|
||||
add x8, x8, x4, lsl #2
|
||||
// w14 = (i_state << 1) | b indexes cabac_transition
orr w14, w2, w3, lsl #1
|
||||
ldrb w4, [x8, w5, uxtw] // i_range_lps
|
||||
ldr w11, [x0, #CABAC_I_LOW]
|
||||
eor w6, w2, w3 // b ^ i_state
|
||||
ldrb w9, [x9, w14, uxtw]
|
||||
// i_range -= i_range_lps; w7 = i_low + reduced i_range
sub w12, w12, w4
|
||||
add w7, w11, w12
|
||||
tst w6, #1 // (b ^ i_state) & 1
|
||||
// if b differs from the state's low bit: i_range = i_range_lps, i_low = w7
csel w12, w4, w12, ne
|
||||
csel w11, w7, w11, ne
|
||||
strb w9, [x0, w10, uxtw] // i_state
|
||||
|
||||
cabac_encode_renorm:
|
||||
ldr w2, [x0, #CABAC_I_QUEUE]
|
||||
// shift = clz(i_range) - 23; applied to both i_low and i_range
clz w5, w12
|
||||
sub w5, w5, #23
|
||||
lsl w11, w11, w5
|
||||
lsl w12, w12, w5
|
||||
// i_queue += shift; a non-negative result means bytes must be flushed
adds w2, w2, w5
|
||||
b.ge cabac_putbyte
|
||||
|
||||
// stp relies on i_range directly following i_low (checked in asm-offsets.c)
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
|
||||
str w2, [x0, #CABAC_I_QUEUE]
|
||||
ret
|
||||
|
||||
.align 5
|
||||
cabac_putbyte:
|
||||
ldr w6, [x0, #CABAC_I_BYTES_OUTSTANDING]
|
||||
// w14 = i_queue + 10 (using the value before the -8 below)
add w14, w2, #10
|
||||
mov w13, #-1
|
||||
sub w2, w2, #8
|
||||
asr w4, w11, w14 // out
|
||||
lsl w13, w13, w14
|
||||
// flags: eq when out == 0xff (used by cinc and b.eq; bic does not alter them)
subs w5, w4, #0xff
|
||||
// i_low keeps only its low (i_queue + 10) bits
bic w11, w11, w13
|
||||
// out == 0xff: just count one more outstanding byte and skip the write-out
cinc w6, w6, eq
|
||||
b.eq 0f
|
||||
|
||||
1:
|
||||
ldr x7, [x0, #CABAC_P]
|
||||
asr w5, w4, #8 // carry
|
||||
// add the carry into the byte written previously (p[-1])
ldurb w8, [x7, #-1]
|
||||
add w8, w8, w5
|
||||
// w5 = carry - 1: the byte value written for every outstanding byte
sub w5, w5, #1
|
||||
sturb w8, [x7, #-1]
|
||||
cbz w6, 3f
|
||||
2:
|
||||
// emit i_bytes_outstanding copies of w5; w6 ends at 0
subs w6, w6, #1
|
||||
strb w5, [x7], #1
|
||||
b.gt 2b
|
||||
3:
|
||||
// emit the low byte of out and publish the advanced p
strb w4, [x7], #1
|
||||
str x7, [x0, #CABAC_P]
|
||||
0:
|
||||
// both stp rely on the member adjacency checked in asm-offsets.c
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
|
||||
stp w2, w6, [x0, #CABAC_I_QUEUE] // store i_queue, i_bytes_outstanding
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// cabac_encode_bypass_asm
// x0 = x264_cabac_t pointer, w1 = bin value b.
// Computes i_low = (i_low << 1) + (b & i_range) and increments i_queue; if
// i_queue becomes non-negative it branches to cabac_putbyte (inside
// cabac_encode_decision_asm) with w11/w12/w2 set up as that label expects.
// NOTE(review): `b & i_range` only adds the full range if the caller passes b
// as an all-ones mask rather than 1 -- confirm against the C caller.
function cabac_encode_bypass_asm, export=1, align=5
|
||||
ldr w12, [x0, #CABAC_I_RANGE]
|
||||
ldr w11, [x0, #CABAC_I_LOW]
|
||||
ldr w2, [x0, #CABAC_I_QUEUE]
|
||||
and w1, w1, w12
|
||||
add w11, w1, w11, lsl #1
|
||||
adds w2, w2, #1
|
||||
b.ge cabac_putbyte
|
||||
// i_range is unchanged on this path, so only i_low and i_queue are stored
str w11, [x0, #CABAC_I_LOW]
|
||||
str w2, [x0, #CABAC_I_QUEUE]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// cabac_encode_terminal_asm
// x0 = x264_cabac_t pointer. Subtracts 2 from i_range. If bit 8 of the result
// is still set, only i_range is stored. Otherwise i_range and i_low are both
// shifted left by one and i_queue is incremented, branching to cabac_putbyte
// (inside cabac_encode_decision_asm) when i_queue becomes non-negative.
function cabac_encode_terminal_asm, export=1, align=5
|
||||
ldr w12, [x0, #CABAC_I_RANGE]
|
||||
sub w12, w12, #2
|
||||
// bit 8 clear -> a one-bit renormalisation is needed
tbz w12, #8, 1f
|
||||
|
||||
str w12, [x0, #CABAC_I_RANGE]
|
||||
ret
|
||||
1:
|
||||
ldr w2, [x0, #CABAC_I_QUEUE]
|
||||
ldr w11, [x0, #CABAC_I_LOW]
|
||||
lsl w12, w12, #1
|
||||
// flags from this adds are consumed by b.ge; the lsl in between leaves them alone
adds w2, w2, #1
|
||||
lsl w11, w11, #1
|
||||
b.ge cabac_putbyte
|
||||
|
||||
stp w11, w12, [x0, #CABAC_I_LOW] // store i_low, i_range
|
||||
str w2, [x0, #CABAC_I_QUEUE]
|
||||
ret
|
||||
endfunc
|
||||
40
common/aarch64/dct-a-common.S
Normal file
40
common/aarch64/dct-a-common.S
Normal file
@@ -0,0 +1,40 @@
|
||||
/****************************************************************************
|
||||
* dct-a-common.S: aarch64 transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
* David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
// This file contains the NEON macros that are intended to be used by
|
||||
// the SVE/SVE2 functions as well
|
||||
|
||||
// DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
// One 4-point butterfly pass. Inputs are \v4..\v7 (d0..d3 below), outputs are
// \v0..\v3:
//   \v0 = (d0 + d3) + (d1 + d2)
//   \v1 = 2*(d0 - d3) + (d1 - d2)
//   \v2 = (d0 + d3) - (d1 + d2)
//   \v3 = (d0 - d3) - 2*(d1 - d2)
// All four input registers are overwritten with intermediates, and outputs are
// written before every input has been read, so \v0..\v3 must be distinct from
// \v4..\v7.
.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
|
||||
// \v1 = d1 + d2, \v6 = d1 - d2
SUMSUB_AB \v1, \v6, \v5, \v6
|
||||
// \v3 = d0 + d3, \v7 = d0 - d3
SUMSUB_AB \v3, \v7, \v4, \v7
|
||||
add \v0, \v3, \v1
|
||||
// \v4 = 2*(d0 - d3), \v5 = 2*(d1 - d2)
add \v4, \v7, \v7
|
||||
add \v5, \v6, \v6
|
||||
sub \v2, \v3, \v1
|
||||
add \v1, \v4, \v6
|
||||
sub \v3, \v7, \v5
|
||||
.endm
|
||||
88
common/aarch64/dct-a-sve.S
Normal file
88
common/aarch64/dct-a-sve.S
Normal file
@@ -0,0 +1,88 @@
|
||||
/****************************************************************************
|
||||
* dct-a-sve.S: aarch64 transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "dct-a-common.S"
|
||||
|
||||
ENABLE_SVE
|
||||
|
||||
// sub4x4_dct_sve
// x0 = output (16 halfwords are stored there), x1 = first pixel block (rows
// FENC_STRIDE apart), x2 = second pixel block (rows FDEC_STRIDE apart).
// Computes the row-wise difference x1 - x2 of a 4x4 block, then runs DCT_1D,
// a 4x4 transpose and DCT_1D again, and stores the 4x4 result.
// SVE is only used for the loads: ld1b into .h lanes under p0 (vl4) reads 4
// bytes and widens each to a halfword, zeroing the inactive lanes; the
// arithmetic then uses the NEON view of the same registers.
function sub4x4_dct_sve, export=1
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
ptrue p0.h, vl4
|
||||
ld1b {z0.h}, p0/z, [x1]
|
||||
add x1, x1, x3
|
||||
ld1b {z1.h}, p0/z, [x2]
|
||||
add x2, x2, x4
|
||||
ld1b {z2.h}, p0/z, [x1]
|
||||
add x1, x1, x3
|
||||
// row 0 difference
sub v16.4h, v0.4h, v1.4h
|
||||
ld1b {z3.h}, p0/z, [x2]
|
||||
add x2, x2, x4
|
||||
ld1b {z4.h}, p0/z, [x1]
|
||||
add x1, x1, x3
|
||||
// row 1 difference
sub v17.4h, v2.4h, v3.4h
|
||||
ld1b {z5.h}, p0/z, [x2]
|
||||
add x2, x2, x4
|
||||
ld1b {z6.h}, p0/z, [x1]
|
||||
// row 2 difference
sub v18.4h, v4.4h, v5.4h
|
||||
ld1b {z7.h}, p0/z, [x2]
|
||||
// row 3 difference
sub v19.4h, v6.4h, v7.4h
|
||||
|
||||
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
|
||||
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
|
||||
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
|
||||
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// zigzag_interleave_8x8_cavlc_sve
// x0 = destination, x1 = source (128 bytes are read), x2 = byte output for
// the four non-zero flags.
// The two ld4 loads de-interleave 64 halfwords by index modulo 4 into
// v0..v3 / v4..v7; these are stored to x0 as four consecutive 16-halfword
// groups (v0,v4 | v1,v5 | v2,v6 | v3,v7). In parallel the unsigned-max
// reductions compute one "any non-zero" flag per group, which is turned into
// 0/1 and stored as two bytes at [x2] and two bytes at [x2 + 8].
function zigzag_interleave_8x8_cavlc_sve, export=1
|
||||
// z31: constant 1 in every 32-bit lane; p2: first two 32-bit lanes
mov z31.s, #1
|
||||
ptrue p2.s, vl2
|
||||
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
|
||||
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
|
||||
umax v16.8h, v0.8h, v4.8h
|
||||
umax v17.8h, v1.8h, v5.8h
|
||||
umax v18.8h, v2.8h, v6.8h
|
||||
umax v19.8h, v3.8h, v7.8h
|
||||
st1 {v0.8h}, [x0], #16
|
||||
st1 {v4.8h}, [x0], #16
|
||||
umaxp v16.8h, v16.8h, v17.8h
|
||||
umaxp v18.8h, v18.8h, v19.8h
|
||||
st1 {v1.8h}, [x0], #16
|
||||
st1 {v5.8h}, [x0], #16
|
||||
umaxp v16.8h, v16.8h, v18.8h
|
||||
st1 {v2.8h}, [x0], #16
|
||||
st1 {v6.8h}, [x0], #16
|
||||
// per 32-bit lane: all-ones if the lane is >= 1, then masked down to 1
cmhs v16.4s, v16.4s, v31.4s
|
||||
st1 {v3.8h}, [x0], #16
|
||||
and v16.16b, v16.16b, v31.16b
|
||||
st1 {v7.8h}, [x0], #16
|
||||
// store the low byte of lanes 0 and 1 (two flag bytes)
st1b {z16.s}, p2, [x2]
|
||||
add x2, x2, #8
|
||||
// move lanes 2 and 3 down and store their two flag bytes at +8
mov v16.d[0], v16.d[1]
|
||||
st1b {z16.s}, p2, [x2]
|
||||
ret
|
||||
endfunc
|
||||
90
common/aarch64/dct-a-sve2.S
Normal file
90
common/aarch64/dct-a-sve2.S
Normal file
@@ -0,0 +1,90 @@
|
||||
/****************************************************************************
|
||||
* dct-a-sve2.S: aarch64 transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "dct-a-common.S"
|
||||
|
||||
ENABLE_SVE
|
||||
ENABLE_SVE2
|
||||
|
||||
// add4x4_idct_sve2
// x0 = pixel block to update in place (rows FDEC_STRIDE apart), x1 = 16
// halfword coefficients. Runs two 1-D inverse-transform passes with a 4x4
// transpose between them, rounds with a shift by 6, adds the existing pixels
// and writes the result back as unsigned-saturated bytes.
// p0 (vl8) governs the pixel loads, so 8 bytes are read per row; p1 (vl4)
// governs the rounding shifts and the stores, so only 4 bytes per row are
// written. x11 walks the rows for loading, x0 for storing.
// Rows 2 and 3 live in swapped registers: row 2 is loaded into z31, added to
// v3 and stored from z3; row 3 is loaded into z30, added to v2 and stored
// from z2.
function add4x4_idct_sve2, export=1
|
||||
mov x2, #FDEC_STRIDE
|
||||
mov x11, x0
|
||||
ptrue p0.h, vl8
|
||||
ptrue p1.h, vl4
|
||||
// v0 = coefficient rows 0,1; v1 = rows 2,3 (four halfwords each)
ld1 {v0.8h, v1.8h}, [x1]
|
||||
|
||||
SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h
|
||||
|
||||
sshr v7.8h, v0.8h, #1
|
||||
sshr v6.8h, v1.8h, #1
|
||||
sub v7.8h, v7.8h, v1.8h
|
||||
add v6.8h, v6.8h, v0.8h
|
||||
// bring the high-half results down so they pair with the low halves of v4/v5
mov v7.d[0], v7.d[1]
|
||||
mov v6.d[0], v6.d[1]
|
||||
// pixel row 0, widened to halfwords
ld1b {z28.h}, p0/z, [x11]
|
||||
add x11, x11, x2
|
||||
SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h
|
||||
SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h
|
||||
|
||||
// note the v3, v2 argument order
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
|
||||
|
||||
SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h
|
||||
|
||||
sshr v7.4h, v1.4h, #1
|
||||
sshr v6.4h, v2.4h, #1
|
||||
sub v7.4h, v7.4h, v2.4h
|
||||
add v6.4h, v6.4h, v1.4h
|
||||
// pixel row 1
ld1b {z29.h}, p0/z, [x11]
|
||||
add x11, x11, x2
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
|
||||
|
||||
// rounding shift right by 6 on the four active halfword lanes
srshr z0.h, p1/m, z0.h, #6
|
||||
srshr z1.h, p1/m, z1.h, #6
|
||||
// pixel row 2 (goes with v3/z3)
ld1b {z31.h}, p0/z, [x11]
|
||||
add x11, x11, x2
|
||||
srshr z2.h, p1/m, z2.h, #6
|
||||
srshr z3.h, p1/m, z3.h, #6
|
||||
// pixel row 3 (goes with v2/z2)
ld1b {z30.h}, p0/z, [x11]
|
||||
|
||||
add v0.8h, v0.8h, v28.8h
|
||||
add v1.8h, v1.8h, v29.8h
|
||||
add v2.8h, v2.8h, v30.8h
|
||||
add v3.8h, v3.8h, v31.8h
|
||||
// saturate each halfword to an unsigned byte
sqxtunb z0.b, z0.h
|
||||
sqxtunb z1.b, z1.h
|
||||
sqxtunb z2.b, z2.h
|
||||
sqxtunb z3.b, z3.h
|
||||
|
||||
st1b {z0.h}, p1, [x0]
|
||||
add x0, x0, x2
|
||||
st1b {z1.h}, p1, [x0]
|
||||
add x0, x0, x2
|
||||
st1b {z3.h}, p1, [x0]
|
||||
add x0, x0, x2
|
||||
st1b {z2.h}, p1, [x0]
|
||||
ret
|
||||
endfunc
|
||||
998
common/aarch64/dct-a.S
Normal file
998
common/aarch64/dct-a.S
Normal file
@@ -0,0 +1,998 @@
|
||||
/****************************************************************************
|
||||
* dct-a.S: aarch64 transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "dct-a-common.S"
|
||||
|
||||
// tbl index table loaded by zigzag_scan_4x4_frame_neon: 32 byte indices, one
// (2k, 2k+1) pair per output element, each pair selecting the 16-bit input
// coefficient k out of the 32 input bytes.
const scan4x4_frame, align=4
|
||||
.byte 0,1, 8,9, 2,3, 4,5
|
||||
.byte 10,11, 16,17, 24,25, 18,19
|
||||
.byte 12,13, 6,7, 14,15, 20,21
|
||||
.byte 26,27, 28,29, 22,23, 30,31
|
||||
endconst
|
||||
|
||||
// tbl index table loaded by zigzag_scan_4x4_field_neon: 16 byte indices
// ((2k, 2k+1) pairs) that reorder the first eight 16-bit coefficients only;
// that function stores the second eight unchanged.
const scan4x4_field, align=4
|
||||
.byte 0,1, 2,3, 8,9, 4,5
|
||||
.byte 6,7, 10,11, 12,13, 14,15
|
||||
endconst
|
||||
|
||||
// Pixel permutation for the zigzag_sub_4x4 "frame" variants: 16 byte indices
// into a vector holding a 4x4 block as four consecutive 4-byte rows
// (index = 4*row + column). Applied with tbl before the subtraction.
const sub4x4_frame, align=4
|
||||
.byte 0, 1, 4, 8
|
||||
.byte 5, 2, 3, 6
|
||||
.byte 9, 12, 13, 10
|
||||
.byte 7, 11, 14, 15
|
||||
endconst
|
||||
|
||||
// Pixel permutation for the zigzag_sub_4x4 "field" variants: 16 byte indices
// into a vector holding a 4x4 block as four consecutive 4-byte rows
// (index = 4*row + column). Applied with tbl before the subtraction.
const sub4x4_field, align=4
|
||||
.byte 0, 4, 1, 8
|
||||
.byte 12, 5, 9, 13
|
||||
.byte 2, 6, 10, 14
|
||||
.byte 3, 7, 11, 15
|
||||
endconst
|
||||
|
||||
// sum = a + (b>>shift) sub = (a>>shift) - b
|
||||
// Arguments: shift = immediate for the arithmetic right shift; sum, sub =
// destination vectors; a, b = source vectors; t0, t1 = scratch vectors that
// are overwritten.
// Aliasing: t0/t1 are written before a and b are last read, so they must be
// distinct from a and b; sum is written before the final sub reads t1 and b,
// so sum must not be t1 or b. sub may reuse a (the 8x8 IDCT below does so).
.macro SUMSUB_SHR shift sum sub a b t0 t1
|
||||
sshr \t0, \b, #\shift
|
||||
sshr \t1, \a, #\shift
|
||||
add \sum, \a, \t0
|
||||
sub \sub, \t1, \b
|
||||
.endm
|
||||
|
||||
// sum = (a>>shift) + b sub = a - (b>>shift)
|
||||
// Arguments: shift = immediate for the arithmetic right shift; sum, sub =
// destination vectors; a, b = source vectors; t0, t1 = scratch vectors that
// are overwritten.
// Aliasing: t0/t1 must be distinct from a and b; sum is written before the
// final sub reads a and t1, so sum must not be a or t1.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
|
||||
sshr \t0, \a, #\shift
|
||||
sshr \t1, \b, #\shift
|
||||
add \sum, \t0, \b
|
||||
sub \sub, \a, \t1
|
||||
.endm
|
||||
|
||||
// a += 1.5*ma b -= 1.5*mb
|
||||
// Exact arithmetic performed: a += ma + (ma >> 1), b -= mb + (mb >> 1), with
// an arithmetic (sign-preserving) shift, i.e. the 1.5x factor is built from
// a truncating halving. a and b are updated in place; t0/t1 are scratch and
// must be distinct from ma and mb.
.macro SUMSUB_15 a b ma mb t0 t1
|
||||
sshr \t0, \ma, #1
|
||||
sshr \t1, \mb, #1
|
||||
add \t0, \t0, \ma
|
||||
add \t1, \t1, \mb
|
||||
add \a, \a, \t0
|
||||
sub \b, \b, \t1
|
||||
.endm
|
||||
|
||||
|
||||
// dct4x4dc_neon: in-place transform of the 16 int16 values at [x0]
// (C side: void x264_dct4x4dc_neon( int16_t d[16] )).
// Structure: two rounds of SUMSUB_AB butterflies separated by transposes,
// followed by a last butterfly stage fused with a rounding halving.
// SUMSUB_AB and transpose are macros defined outside this chunk.
function dct4x4dc_neon, export=1
|
||||
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
|
||||
// constant 1, added below so that shsub computes (a + 1 - b) >> 1
movi v31.4h, #1
|
||||
SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
|
||||
SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
|
||||
transpose v4.4h, v6.4h, v0.4h, v2.4h
|
||||
transpose v5.4h, v7.4h, v1.4h, v3.4h
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
|
||||
transpose v4.2s, v5.2s, v0.2s, v1.2s
|
||||
transpose v6.2s, v7.2s, v2.2s, v3.2s
|
||||
add v16.4h, v4.4h, v31.4h
|
||||
add v17.4h, v6.4h, v31.4h
|
||||
// final stage: sums via srhadd = (a + b + 1) >> 1, differences via
// shsub on the +1 biased operand = (a + 1 - b) >> 1
srhadd v0.4h, v4.4h, v5.4h
|
||||
shsub v1.4h, v16.4h, v5.4h
|
||||
shsub v2.4h, v17.4h, v7.4h
|
||||
srhadd v3.4h, v6.4h, v7.4h
|
||||
st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// idct4x4dc_neon: in-place transform of the 16 int16 values at [x0]
// (C side: void x264_idct4x4dc_neon( int16_t d[16] )).
// Same butterfly/transposition network as dct4x4dc_neon, but the last stage
// is a plain SUMSUB_AB with no halving or rounding.
// SUMSUB_AB and transpose are macros defined outside this chunk.
function idct4x4dc_neon, export=1
|
||||
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
|
||||
SUMSUB_AB v4.4h, v5.4h, v0.4h, v1.4h
|
||||
SUMSUB_AB v6.4h, v7.4h, v2.4h, v3.4h
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v3.4h, v1.4h, v5.4h, v7.4h
|
||||
transpose v4.4h, v6.4h, v0.4h, v2.4h
|
||||
transpose v5.4h, v7.4h, v1.4h, v3.4h
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
|
||||
transpose v4.2s, v5.2s, v0.2s, v1.2s
|
||||
transpose v6.2s, v7.2s, v2.2s, v3.2s
|
||||
SUMSUB_AB v0.4h, v1.4h, v4.4h, v5.4h
|
||||
SUMSUB_AB v3.4h, v2.4h, v6.4h, v7.4h
|
||||
st1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// sub4x4_dct_neon: 4x4 forward transform of a pixel difference
// (C side: void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1,
// uint8_t *pix2 )).
// x0 = dct output, x1 = pix1 (rows FENC_STRIDE apart), x2 = pix2 (rows
// FDEC_STRIDE apart). Each row contributes 4 bytes from either source.
// DCT_1D and transpose4x4.h are macros defined outside this chunk.
function sub4x4_dct_neon, export=1
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
ld1 {v0.s}[0], [x1], x3
|
||||
ld1 {v1.s}[0], [x2], x4
|
||||
ld1 {v2.s}[0], [x1], x3
|
||||
// widening subtract pix1 - pix2 into 16-bit lanes; only the low four lanes
// (the 4 loaded bytes) are consumed by the .4h operations below
usubl v16.8h, v0.8b, v1.8b
|
||||
ld1 {v3.s}[0], [x2], x4
|
||||
ld1 {v4.s}[0], [x1], x3
|
||||
usubl v17.8h, v2.8b, v3.8b
|
||||
ld1 {v5.s}[0], [x2], x4
|
||||
ld1 {v6.s}[0], [x1], x3
|
||||
usubl v18.8h, v4.8b, v5.8b
|
||||
ld1 {v7.s}[0], [x2], x4
|
||||
usubl v19.8h, v6.8b, v7.8b
|
||||
|
||||
// 1-D pass, transpose, 1-D pass
DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
|
||||
transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
|
||||
DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
|
||||
st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// sub8x4_dct_neon (file-local: no export=1): forward transform of an 8-wide,
// 4-row strip of pixel differences, i.e. two 4x4 blocks processed together.
// In:  x0 = coefficient output, x1 / x2 = minuend / subtrahend pixel rows,
//      x3 / x4 = their row strides. x3/x4 are NOT set here; the callers in
//      this file load FENC_STRIDE / FDEC_STRIDE before branching in.
// Out: 64 bytes stored at x0. x0 is left advanced by 64 and x1/x2 by four
//      rows, which is what lets the callers chain calls back to back.
// No general register other than x0-x4 is written in the visible code, which
// is what allows the callers to park the return address in x5.
// DCT_1D, transpose4x8.h and SUMSUB_AB are defined outside this chunk.
function sub8x4_dct_neon
|
||||
ld1 {v0.8b}, [x1], x3
|
||||
ld1 {v1.8b}, [x2], x4
|
||||
usubl v16.8h, v0.8b, v1.8b
|
||||
ld1 {v2.8b}, [x1], x3
|
||||
ld1 {v3.8b}, [x2], x4
|
||||
usubl v17.8h, v2.8b, v3.8b
|
||||
ld1 {v4.8b}, [x1], x3
|
||||
ld1 {v5.8b}, [x2], x4
|
||||
usubl v18.8h, v4.8b, v5.8b
|
||||
ld1 {v6.8b}, [x1], x3
|
||||
ld1 {v7.8b}, [x2], x4
|
||||
usubl v19.8h, v6.8b, v7.8b
|
||||
|
||||
DCT_1D v0.8h, v1.8h, v2.8h, v3.8h, v16.8h, v17.8h, v18.8h, v19.8h
|
||||
transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
|
||||
|
||||
// second 1-D pass written out inline (assuming SUMSUB_AB yields a+b / a-b):
//   s03 = v0+v3, d03 = v0-v3, s12 = v1+v2, d12 = v1-v2
//   v0 = s03 + s12, v1 = s03 - s12, v2 = 2*d03 + d12, v3 = d03 - 2*d12
SUMSUB_AB v16.8h, v19.8h, v0.8h, v3.8h
|
||||
SUMSUB_AB v17.8h, v18.8h, v1.8h, v2.8h
|
||||
add v22.8h, v19.8h, v19.8h
|
||||
add v21.8h, v18.8h, v18.8h
|
||||
add v0.8h, v16.8h, v17.8h
|
||||
sub v1.8h, v16.8h, v17.8h
|
||||
|
||||
add v2.8h, v22.8h, v18.8h
|
||||
sub v3.8h, v19.8h, v21.8h
|
||||
|
||||
// regroup 64-bit halves so the low halves are stored first:
//   v4 = {v0.lo, v2.lo}, v5 = {v1.lo, v3.lo},
//   v6 = {v0.hi, v2.hi}, v7 = {v1.hi, v3.hi}
// NOTE(review): presumably low halves = first 4x4 block, high halves =
// second one - depends on transpose4x8.h, confirm there.
zip1 v4.2d, v0.2d, v2.2d
|
||||
zip2 v6.2d, v0.2d, v2.2d
|
||||
zip1 v5.2d, v1.2d, v3.2d
|
||||
zip2 v7.2d, v1.2d, v3.2d
|
||||
|
||||
st1 {v4.8h}, [x0], #16
|
||||
st1 {v5.8h}, [x0], #16
|
||||
st1 {v6.8h}, [x0], #16
|
||||
st1 {v7.8h}, [x0], #16
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// sub8x8_dct_neon: 8x8 pixel difference -> four 4x4 coefficient blocks
// (C side: void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1,
// uint8_t *pix2 )). Implemented as two consecutive sub8x4_dct_neon calls,
// which advance x0/x1/x2 themselves.
function sub8x8_dct_neon, export=1
|
||||
// keep the return address in x5 across the bl (the helper does not write x5)
mov x5, x30
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
bl sub8x4_dct_neon
|
||||
// restore x30 and tail-call: the helper returns straight to our caller
mov x30, x5
|
||||
b sub8x4_dct_neon
|
||||
endfunc
|
||||
|
||||
// sub16x16_dct_neon: 16x16 pixel difference transformed with eight
// sub8x4_dct_neon calls (two per 8x8 quadrant). x0 = coefficient output,
// x1 / x2 = pixel sources with FENC_STRIDE / FDEC_STRIDE rows.
// Each helper call advances x1/x2 by four rows; the pointer fix-ups below
// visit the quadrants in the order top-left, top-right, bottom-left,
// bottom-right. x0 is only ever advanced by the helper.
function sub16x16_dct_neon, export=1
|
||||
// keep the return address in x5 across the bl calls
mov x5, x30
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
bl sub8x4_dct_neon
|
||||
bl sub8x4_dct_neon
|
||||
// back up 8 rows and step 8 pixels right: top-right quadrant
sub x1, x1, #8*FENC_STRIDE-8
|
||||
sub x2, x2, #8*FDEC_STRIDE-8
|
||||
bl sub8x4_dct_neon
|
||||
bl sub8x4_dct_neon
|
||||
// already 8 rows down; step 8 pixels left: bottom-left quadrant
sub x1, x1, #8
|
||||
sub x2, x2, #8
|
||||
bl sub8x4_dct_neon
|
||||
bl sub8x4_dct_neon
|
||||
// back up 8 rows and step 8 pixels right: bottom-right quadrant
sub x1, x1, #8*FENC_STRIDE-8
|
||||
sub x2, x2, #8*FDEC_STRIDE-8
|
||||
bl sub8x4_dct_neon
|
||||
// last strip is a tail call with the original return address
mov x30, x5
|
||||
b sub8x4_dct_neon
|
||||
endfunc
|
||||
|
||||
|
||||
// DCT8_1D: one 1-D pass of the 8-point forward transform over eight vectors
// of eight int16 lanes. Input and output are both v0-v7; v16-v31 are used as
// temporaries and are all overwritten.
// The "type" argument (row / col at the call sites) is not referenced in the
// body; both passes expand to the same code.
// SUMSUB_AB is defined outside this chunk (presumably a+b / a-b, consistent
// with the s../d.. annotations below).
.macro DCT8_1D type
|
||||
SUMSUB_AB v18.8h, v17.8h, v3.8h, v4.8h // s34/d34
|
||||
SUMSUB_AB v19.8h, v16.8h, v2.8h, v5.8h // s25/d25
|
||||
SUMSUB_AB v22.8h, v21.8h, v1.8h, v6.8h // s16/d16
|
||||
SUMSUB_AB v23.8h, v20.8h, v0.8h, v7.8h // s07/d07
|
||||
|
||||
SUMSUB_AB v24.8h, v26.8h, v23.8h, v18.8h // a0/a2
|
||||
SUMSUB_AB v25.8h, v27.8h, v22.8h, v19.8h // a1/a3
|
||||
|
||||
SUMSUB_AB v30.8h, v29.8h, v20.8h, v17.8h // a6/a5
|
||||
// subtract d16 + (d16 >> 1) from a6 and d25 + (d25 >> 1) from a5
sshr v23.8h, v21.8h, #1
|
||||
sshr v18.8h, v16.8h, #1
|
||||
add v23.8h, v23.8h, v21.8h
|
||||
add v18.8h, v18.8h, v16.8h
|
||||
sub v30.8h, v30.8h, v23.8h
|
||||
sub v29.8h, v29.8h, v18.8h
|
||||
|
||||
SUMSUB_AB v28.8h, v31.8h, v21.8h, v16.8h // a4/a7
|
||||
// add d07 + (d07 >> 1) to a4 (result in v22) and d34 + (d34 >> 1) to a7
sshr v22.8h, v20.8h, #1
|
||||
sshr v19.8h, v17.8h, #1
|
||||
add v22.8h, v22.8h, v20.8h
|
||||
add v19.8h, v19.8h, v17.8h
|
||||
add v22.8h, v28.8h, v22.8h
|
||||
add v31.8h, v31.8h, v19.8h
|
||||
|
||||
// final combination into the eight outputs v0-v7
SUMSUB_AB v0.8h, v4.8h, v24.8h, v25.8h
|
||||
SUMSUB_SHR 2, v1.8h, v7.8h, v22.8h, v31.8h, v16.8h, v17.8h
|
||||
SUMSUB_SHR 1, v2.8h, v6.8h, v26.8h, v27.8h, v18.8h, v19.8h
|
||||
SUMSUB_SHR2 2, v3.8h, v5.8h, v30.8h, v29.8h, v20.8h, v21.8h
|
||||
.endm
|
||||
|
||||
// sub8x8_dct8_neon: 8x8 forward transform of a pixel difference.
// x0 = output (64 int16 = 128 bytes), x1 = first pixel source (FENC_STRIDE
// rows), x2 = second pixel source (FDEC_STRIDE rows); the difference taken is
// [x1] - [x2].
// On return x0 has advanced by 128 bytes and x1/x2 by eight rows;
// sub16x16_dct8_neon relies on that. Only x0-x4 are written here.
// NOTE(review): the C prototype is not visible in this chunk.
function sub8x8_dct8_neon, export=1
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
// load eight rows from both sources, widening-subtract into v0-v7
ld1 {v16.8b}, [x1], x3
|
||||
ld1 {v17.8b}, [x2], x4
|
||||
ld1 {v18.8b}, [x1], x3
|
||||
ld1 {v19.8b}, [x2], x4
|
||||
usubl v0.8h, v16.8b, v17.8b
|
||||
ld1 {v20.8b}, [x1], x3
|
||||
ld1 {v21.8b}, [x2], x4
|
||||
usubl v1.8h, v18.8b, v19.8b
|
||||
ld1 {v22.8b}, [x1], x3
|
||||
ld1 {v23.8b}, [x2], x4
|
||||
usubl v2.8h, v20.8b, v21.8b
|
||||
ld1 {v24.8b}, [x1], x3
|
||||
ld1 {v25.8b}, [x2], x4
|
||||
usubl v3.8h, v22.8b, v23.8b
|
||||
ld1 {v26.8b}, [x1], x3
|
||||
ld1 {v27.8b}, [x2], x4
|
||||
usubl v4.8h, v24.8b, v25.8b
|
||||
ld1 {v28.8b}, [x1], x3
|
||||
ld1 {v29.8b}, [x2], x4
|
||||
usubl v5.8h, v26.8b, v27.8b
|
||||
ld1 {v30.8b}, [x1], x3
|
||||
ld1 {v31.8b}, [x2], x4
|
||||
usubl v6.8h, v28.8b, v29.8b
|
||||
usubl v7.8h, v30.8b, v31.8b
|
||||
|
||||
// 1-D pass, 8x8 transpose (v30/v31 as scratch), 1-D pass
DCT8_1D row
|
||||
transpose8x8.h v0, v1, v2, v3, v4, v5, v6, v7, v30, v31
|
||||
DCT8_1D col
|
||||
|
||||
st1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
|
||||
st1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// sub16x16_dct8_neon: 16x16 pixel difference transformed as four 8x8 blocks
// by calling sub8x8_dct8_neon four times. x0/x1/x2 are passed through; the
// callee advances x0 by 128 bytes and x1/x2 by eight rows per call, and the
// fix-ups below visit the quadrants in the order top-left, top-right,
// bottom-left, bottom-right.
function sub16x16_dct8_neon, export=1
|
||||
// keep the return address in x7 (the callee only writes x0-x4)
mov x7, x30
|
||||
bl X(sub8x8_dct8_neon)
|
||||
// back up 8 rows and step 8 pixels right
sub x1, x1, #FENC_STRIDE*8 - 8
|
||||
sub x2, x2, #FDEC_STRIDE*8 - 8
|
||||
bl X(sub8x8_dct8_neon)
|
||||
// already 8 rows down; step 8 pixels left
sub x1, x1, #8
|
||||
sub x2, x2, #8
|
||||
bl X(sub8x8_dct8_neon)
|
||||
// restore the return address, fix the pointers, then tail-call
mov x30, x7
|
||||
sub x1, x1, #FENC_STRIDE*8 - 8
|
||||
sub x2, x2, #FDEC_STRIDE*8 - 8
|
||||
b X(sub8x8_dct8_neon)
|
||||
endfunc
|
||||
|
||||
|
||||
// First part of IDCT (minus final SUMSUB_BA)
|
||||
// Inputs d0-d3, outputs d4-d7 (assuming SUMSUB_AB, defined outside this
// chunk, yields a+b / a-b):
//   d4 = d0 + d2          d5 = d0 - d2
//   d7 = (d1 >> 1) - d3   d6 = (d3 >> 1) + d1
// d7 is written before d3 is read again and d6 before d1 is read again, so
// d7 must not be d1 or d3 and d6 must not be d1. Callers finish the stage
// with two more SUMSUB_AB on (d4, d6) and (d5, d7).
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
|
||||
SUMSUB_AB \d4, \d5, \d0, \d2
|
||||
sshr \d7, \d1, #1
|
||||
sshr \d6, \d3, #1
|
||||
sub \d7, \d7, \d3
|
||||
add \d6, \d6, \d1
|
||||
.endm
|
||||
|
||||
// add4x4_idct_neon: 4x4 inverse transform of the 16 int16 coefficients at
// [x1]; the result is rounded (>> 6), added to the 4x4 byte block at [x0]
// (rows FDEC_STRIDE apart) and stored back with unsigned saturation.
// NOTE(review): argument roles are read off the loads/stores; the C
// prototype is not visible in this chunk.
// SUMSUB_AB and transpose4x4.h are defined outside this chunk.
function add4x4_idct_neon, export=1
|
||||
mov x2, #FDEC_STRIDE
|
||||
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
|
||||
|
||||
// first 1-D pass; destination rows are loaded in between the arithmetic
IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
|
||||
ld1 {v28.s}[0], [x0], x2
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
|
||||
|
||||
// after the butterflies rows 2 and 3 sit in v3 and v2, hence the v3, v2
// argument order here and in the second pass
transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
|
||||
|
||||
IDCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v3.4h, v2.4h
|
||||
ld1 {v29.s}[0], [x0], x2
|
||||
SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
|
||||
SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
|
||||
|
||||
srshr v0.4h, v0.4h, #6
|
||||
srshr v1.4h, v1.4h, #6
|
||||
// third destination row goes to v31, fourth to v30, matching v3 / v2
ld1 {v31.s}[0], [x0], x2
|
||||
srshr v2.4h, v2.4h, #6
|
||||
srshr v3.4h, v3.4h, #6
|
||||
ld1 {v30.s}[0], [x0], x2
|
||||
|
||||
// rewind x0 by four rows for the stores
sub x0, x0, x2, lsl #2
|
||||
uaddw v0.8h, v0.8h, v28.8b
|
||||
uaddw v1.8h, v1.8h, v29.8b
|
||||
uaddw v2.8h, v2.8h, v30.8b
|
||||
uaddw v3.8h, v3.8h, v31.8b
|
||||
// saturating narrow to unsigned 8 bit
sqxtun v0.8b, v0.8h
|
||||
sqxtun v1.8b, v1.8h
|
||||
sqxtun v2.8b, v2.8h
|
||||
sqxtun v3.8b, v3.8h
|
||||
|
||||
// rows are written in the order v0, v1, v3, v2 (see above)
st1 {v0.s}[0], [x0], x2
|
||||
st1 {v1.s}[0], [x0], x2
|
||||
st1 {v3.s}[0], [x0], x2
|
||||
st1 {v2.s}[0], [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// add8x4_idct_neon: inverse transform of 32 int16 coefficients (64 bytes at
// [x1], i.e. two 4x4 blocks) added to an 8-wide, 4-row strip of bytes at [x0].
// x2 must already hold the destination row stride: it is used but never set
// here (add8x8_idct_neon / add16x16_idct_neon load FDEC_STRIDE before
// calling).
// On return x1 has advanced by 64 bytes and x0 by four rows, which is what
// lets those callers chain calls. No general register other than x0/x1 is
// written in the visible code.
// SUMSUB_AB, transpose and transpose4x8.h are defined outside this chunk.
function add8x4_idct_neon, export=1
|
||||
ld1 {v0.8h,v1.8h}, [x1], #32
|
||||
ld1 {v2.8h,v3.8h}, [x1], #32
|
||||
// regroup 64-bit halves of the two coefficient blocks before the first pass
transpose v20.2d, v21.2d, v0.2d, v2.2d
|
||||
transpose v22.2d, v23.2d, v1.2d, v3.2d
|
||||
IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h
|
||||
SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
|
||||
SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
|
||||
|
||||
transpose4x8.h v0, v1, v2, v3, v4, v5, v6, v7
|
||||
|
||||
IDCT_1D v16.8h, v17.8h, v18.8h, v19.8h, v0.8h, v1.8h, v2.8h, v3.8h
|
||||
SUMSUB_AB v0.8h, v3.8h, v16.8h, v18.8h
|
||||
SUMSUB_AB v1.8h, v2.8h, v17.8h, v19.8h
|
||||
|
||||
// rounding shift interleaved with the destination row loads
srshr v0.8h, v0.8h, #6
|
||||
ld1 {v28.8b}, [x0], x2
|
||||
srshr v1.8h, v1.8h, #6
|
||||
ld1 {v29.8b}, [x0], x2
|
||||
srshr v2.8h, v2.8h, #6
|
||||
ld1 {v30.8b}, [x0], x2
|
||||
srshr v3.8h, v3.8h, #6
|
||||
ld1 {v31.8b}, [x0], x2
|
||||
|
||||
// rewind x0 by four rows for the stores
sub x0, x0, x2, lsl #2
|
||||
uaddw v0.8h, v0.8h, v28.8b
|
||||
uaddw v1.8h, v1.8h, v29.8b
|
||||
uaddw v2.8h, v2.8h, v30.8b
|
||||
uaddw v3.8h, v3.8h, v31.8b
|
||||
|
||||
// saturating narrow to unsigned 8 bit, interleaved with the stores
sqxtun v0.8b, v0.8h
|
||||
sqxtun v1.8b, v1.8h
|
||||
st1 {v0.8b}, [x0], x2
|
||||
sqxtun v2.8b, v2.8h
|
||||
st1 {v1.8b}, [x0], x2
|
||||
sqxtun v3.8b, v3.8h
|
||||
st1 {v2.8b}, [x0], x2
|
||||
st1 {v3.8b}, [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// add8x8_idct_neon: 8x8 worth of 4x4 inverse transforms, done as two
// add8x4_idct_neon calls (upper then lower four rows). x0 = destination
// pixels, x1 = coefficients; both are advanced by the helper.
function add8x8_idct_neon, export=1
|
||||
// the helper expects the row stride in x2
mov x2, #FDEC_STRIDE
|
||||
// keep the return address in x5 across the bl
mov x5, x30
|
||||
bl X(add8x4_idct_neon)
|
||||
// restore x30 and tail-call for the second strip
mov x30, x5
|
||||
b X(add8x4_idct_neon)
|
||||
endfunc
|
||||
|
||||
// add16x16_idct_neon: 16x16 worth of 4x4 inverse transforms, done as eight
// add8x4_idct_neon calls (two per 8x8 quadrant). x0 = destination pixels,
// x1 = coefficients (advanced only by the helper, 64 bytes per call).
// Each helper call advances x0 by four rows; the fix-ups below visit the
// quadrants in the order top-left, top-right, bottom-left, bottom-right.
function add16x16_idct_neon, export=1
|
||||
// the helper expects the row stride in x2
mov x2, #FDEC_STRIDE
|
||||
// keep the return address in x5 across the bl calls
mov x5, x30
|
||||
bl X(add8x4_idct_neon)
|
||||
bl X(add8x4_idct_neon)
|
||||
// back up 8 rows and step 8 pixels right
sub x0, x0, #8*FDEC_STRIDE-8
|
||||
bl X(add8x4_idct_neon)
|
||||
bl X(add8x4_idct_neon)
|
||||
// already 8 rows down; step 8 pixels left
sub x0, x0, #8
|
||||
bl X(add8x4_idct_neon)
|
||||
bl X(add8x4_idct_neon)
|
||||
// back up 8 rows and step 8 pixels right
sub x0, x0, #8*FDEC_STRIDE-8
|
||||
bl X(add8x4_idct_neon)
|
||||
// last strip is a tail call with the original return address
mov x30, x5
|
||||
b X(add8x4_idct_neon)
|
||||
endfunc
|
||||
|
||||
// IDCT8_1D: one 1-D pass of the 8-point inverse transform over eight vectors
// of eight int16 lanes. Input and output are both v16-v23; v0-v3 are
// overwritten as temporaries.
// type = row: v22/v23 (the last two input rows) are not expected to be loaded
// yet; they are fetched from [x1] (post-incremented by 32) inside the macro,
// after the first butterfly has been issued.
// type = col (anything other than "row"): all eight inputs must already be in
// registers.
// SUMSUB_AB is defined outside this chunk (presumably a+b / a-b).
.macro IDCT8_1D type
|
||||
SUMSUB_AB v0.8h, v1.8h, v16.8h, v20.8h // a0/a2
|
||||
.ifc \type, row
|
||||
ld1 {v22.8h,v23.8h}, [x1], #32
|
||||
.endif
|
||||
SUMSUB_SHR 1, v2.8h, v3.8h, v18.8h, v22.8h, v16.8h, v20.8h // a6/a4
|
||||
SUMSUB_AB v16.8h, v18.8h, v21.8h, v19.8h
|
||||
SUMSUB_15 v16.8h, v18.8h, v17.8h, v23.8h, v20.8h, v22.8h // a7/a1
|
||||
SUMSUB_AB v22.8h, v23.8h, v23.8h, v17.8h
|
||||
SUMSUB_15 v23.8h, v22.8h, v21.8h, v19.8h, v20.8h, v17.8h // a5/a3
|
||||
|
||||
SUMSUB_SHR 2, v21.8h, v22.8h, v22.8h, v23.8h, v19.8h, v17.8h // b3/b5
|
||||
SUMSUB_SHR2 2, v20.8h, v23.8h, v16.8h, v18.8h, v19.8h, v17.8h // b1/b7
|
||||
|
||||
SUMSUB_AB v18.8h, v2.8h, v0.8h, v2.8h // b0/b6
|
||||
SUMSUB_AB v19.8h, v3.8h, v1.8h, v3.8h // b2/b4
|
||||
|
||||
// final butterflies produce the eight outputs in v16-v23
SUMSUB_AB v16.8h, v23.8h, v18.8h, v23.8h
|
||||
SUMSUB_AB v17.8h, v22.8h, v19.8h, v22.8h
|
||||
SUMSUB_AB v18.8h, v21.8h, v3.8h, v21.8h
|
||||
SUMSUB_AB v19.8h, v20.8h, v2.8h, v20.8h
|
||||
.endm
|
||||
|
||||
// add8x8_idct8_neon: 8x8 inverse transform of the 64 int16 coefficients at
// [x1]; the result is rounded (>> 6), added to the 8x8 byte block at [x0]
// (rows FDEC_STRIDE apart) and stored back with unsigned saturation.
// On return x1 has advanced by 128 bytes and x0 by eight rows;
// add16x16_idct8_neon relies on that. Only x0-x2 are written here.
// NOTE(review): the C prototype is not visible in this chunk.
function add8x8_idct8_neon, export=1
|
||||
mov x2, #FDEC_STRIDE
|
||||
// first six coefficient rows; the last two are loaded inside IDCT8_1D row
ld1 {v16.8h,v17.8h}, [x1], #32
|
||||
ld1 {v18.8h,v19.8h}, [x1], #32
|
||||
ld1 {v20.8h,v21.8h}, [x1], #32
|
||||
|
||||
IDCT8_1D row
|
||||
|
||||
transpose8x8.h v16, v17, v18, v19, v20, v21, v22, v23, v30, v31
|
||||
|
||||
IDCT8_1D col
|
||||
|
||||
// rounding shift interleaved with loading the eight destination rows
ld1 {v0.8b}, [x0], x2
|
||||
srshr v16.8h, v16.8h, #6
|
||||
ld1 {v1.8b}, [x0], x2
|
||||
srshr v17.8h, v17.8h, #6
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
srshr v18.8h, v18.8h, #6
|
||||
ld1 {v3.8b}, [x0], x2
|
||||
srshr v19.8h, v19.8h, #6
|
||||
ld1 {v4.8b}, [x0], x2
|
||||
srshr v20.8h, v20.8h, #6
|
||||
ld1 {v5.8b}, [x0], x2
|
||||
srshr v21.8h, v21.8h, #6
|
||||
ld1 {v6.8b}, [x0], x2
|
||||
srshr v22.8h, v22.8h, #6
|
||||
ld1 {v7.8b}, [x0], x2
|
||||
srshr v23.8h, v23.8h, #6
|
||||
// rewind x0 by eight rows for the stores
sub x0, x0, x2, lsl #3
|
||||
|
||||
// add (uaddw), saturating narrow (sqxtun) and store, interleaved
uaddw v16.8h, v16.8h, v0.8b
|
||||
uaddw v17.8h, v17.8h, v1.8b
|
||||
uaddw v18.8h, v18.8h, v2.8b
|
||||
sqxtun v0.8b, v16.8h
|
||||
sqxtun v1.8b, v17.8h
|
||||
sqxtun v2.8b, v18.8h
|
||||
uaddw v19.8h, v19.8h, v3.8b
|
||||
st1 {v0.8b}, [x0], x2
|
||||
uaddw v20.8h, v20.8h, v4.8b
|
||||
st1 {v1.8b}, [x0], x2
|
||||
uaddw v21.8h, v21.8h, v5.8b
|
||||
st1 {v2.8b}, [x0], x2
|
||||
sqxtun v3.8b, v19.8h
|
||||
sqxtun v4.8b, v20.8h
|
||||
uaddw v22.8h, v22.8h, v6.8b
|
||||
uaddw v23.8h, v23.8h, v7.8b
|
||||
st1 {v3.8b}, [x0], x2
|
||||
sqxtun v5.8b, v21.8h
|
||||
st1 {v4.8b}, [x0], x2
|
||||
sqxtun v6.8b, v22.8h
|
||||
sqxtun v7.8b, v23.8h
|
||||
st1 {v5.8b}, [x0], x2
|
||||
st1 {v6.8b}, [x0], x2
|
||||
st1 {v7.8b}, [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// add16x16_idct8_neon: four add8x8_idct8_neon calls covering a 16x16 block.
// x0 = destination pixels, x1 = coefficients (advanced only by the callee,
// 128 bytes per call). The callee advances x0 by eight rows; the fix-ups
// below visit the quadrants in the order top-left, top-right, bottom-left,
// bottom-right.
function add16x16_idct8_neon, export=1
|
||||
// keep the return address in x7 (the callee only writes x0-x2)
mov x7, x30
|
||||
bl X(add8x8_idct8_neon)
|
||||
// back up 8 rows and step 8 pixels right
sub x0, x0, #8*FDEC_STRIDE-8
|
||||
bl X(add8x8_idct8_neon)
|
||||
// already 8 rows down; step 8 pixels left
sub x0, x0, #8
|
||||
bl X(add8x8_idct8_neon)
|
||||
// back up 8 rows and step 8 pixels right, then tail-call
sub x0, x0, #8*FDEC_STRIDE-8
|
||||
mov x30, x7
|
||||
b X(add8x8_idct8_neon)
|
||||
endfunc
|
||||
|
||||
// add8x8_idct_dc_neon: adds four rounded DC values to an 8x8 byte block.
// x0 = pixels (rows FDEC_STRIDE apart, updated in place), x1 = four int16 DC
// values. Each DC is rounded with srshr #6 and then applied to one 4x4
// quadrant: dc0 / dc1 to the left / right half of rows 0-3, dc2 / dc3 to the
// left / right half of rows 4-7.
// The signed add is done with unsigned saturating byte ops: the DC and its
// negation are each narrowed with sqxtun (which clamps negatives to 0), so
// one of the two is zero; uqadd applies a positive DC, uqsub the magnitude
// of a negative one.
// NOTE(review): the C prototype is not visible in this chunk.
function add8x8_idct_dc_neon, export=1
|
||||
mov x2, #FDEC_STRIDE
|
||||
ld1 {v16.4h}, [x1]
|
||||
ld1 {v0.8b}, [x0], x2
|
||||
srshr v16.4h, v16.4h, #6
|
||||
ld1 {v1.8b}, [x0], x2
|
||||
// broadcast each DC, then pair them up per row group
dup v20.8h, v16.h[0]
|
||||
dup v21.8h, v16.h[1]
|
||||
ld1 {v2.8b}, [x0], x2
|
||||
dup v22.8h, v16.h[2]
|
||||
dup v23.8h, v16.h[3]
|
||||
ld1 {v3.8b}, [x0], x2
|
||||
// v20 = {dc0 x4, dc1 x4} for rows 0-3
trn1 v20.2d, v20.2d, v21.2d
|
||||
ld1 {v4.8b}, [x0], x2
|
||||
// v21 = {dc2 x4, dc3 x4} for rows 4-7
trn1 v21.2d, v22.2d, v23.2d
|
||||
ld1 {v5.8b}, [x0], x2
|
||||
// negated copies for the subtract path
neg v22.8h, v20.8h
|
||||
ld1 {v6.8b}, [x0], x2
|
||||
neg v23.8h, v21.8h
|
||||
ld1 {v7.8b}, [x0], x2
|
||||
|
||||
// rewind x0 by eight rows for the stores
sub x0, x0, #8*FDEC_STRIDE
|
||||
|
||||
sqxtun v20.8b, v20.8h
|
||||
sqxtun v21.8b, v21.8h
|
||||
sqxtun v22.8b, v22.8h
|
||||
sqxtun v23.8b, v23.8h
|
||||
|
||||
uqadd v0.8b, v0.8b, v20.8b
|
||||
uqadd v1.8b, v1.8b, v20.8b
|
||||
uqadd v2.8b, v2.8b, v20.8b
|
||||
uqadd v3.8b, v3.8b, v20.8b
|
||||
uqadd v4.8b, v4.8b, v21.8b
|
||||
uqadd v5.8b, v5.8b, v21.8b
|
||||
uqadd v6.8b, v6.8b, v21.8b
|
||||
uqadd v7.8b, v7.8b, v21.8b
|
||||
uqsub v0.8b, v0.8b, v22.8b
|
||||
uqsub v1.8b, v1.8b, v22.8b
|
||||
uqsub v2.8b, v2.8b, v22.8b
|
||||
uqsub v3.8b, v3.8b, v22.8b
|
||||
uqsub v4.8b, v4.8b, v23.8b
|
||||
uqsub v5.8b, v5.8b, v23.8b
|
||||
uqsub v6.8b, v6.8b, v23.8b
|
||||
uqsub v7.8b, v7.8b, v23.8b
|
||||
|
||||
st1 {v0.8b}, [x0], x2
|
||||
st1 {v1.8b}, [x0], x2
|
||||
st1 {v2.8b}, [x0], x2
|
||||
st1 {v3.8b}, [x0], x2
|
||||
st1 {v4.8b}, [x0], x2
|
||||
st1 {v5.8b}, [x0], x2
|
||||
st1 {v6.8b}, [x0], x2
|
||||
st1 {v7.8b}, [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// ADD16x4_IDCT_DC: applies four DC values to a 16-wide, 4-row strip of bytes.
// dc  = a vector register with element size but no lane index (the call
//       sites pass v0.h ... v3.h); lanes [0]..[3] are read, one per group of
//       four pixel columns.
// x0  = read pointer, x2 = write pointer, x3 = row stride; x0 and x2 are both
//       advanced by four rows.
// The signed add uses the sqxtun / uqadd / uqsub scheme: v20 holds the DCs
// clamped to >= 0, v21 the clamped negated DCs, so one of the two is zero for
// every lane. v4-v7 and v20-v27 are overwritten.
.macro ADD16x4_IDCT_DC dc
|
||||
ld1 {v4.16b}, [x0], x3
|
||||
dup v24.8h, \dc[0]
|
||||
dup v25.8h, \dc[1]
|
||||
ld1 {v5.16b}, [x0], x3
|
||||
dup v26.8h, \dc[2]
|
||||
dup v27.8h, \dc[3]
|
||||
ld1 {v6.16b}, [x0], x3
|
||||
// v24 = {dc0 x4, dc1 x4}, v25 = {dc2 x4, dc3 x4}
trn1 v24.2d, v24.2d, v25.2d
|
||||
ld1 {v7.16b}, [x0], x3
|
||||
trn1 v25.2d, v26.2d, v27.2d
|
||||
neg v26.8h, v24.8h
|
||||
neg v27.8h, v25.8h
|
||||
|
||||
// narrow to 16 bytes: low half via sqxtun, high half via sqxtun2
sqxtun v20.8b, v24.8h
|
||||
sqxtun v21.8b, v26.8h
|
||||
sqxtun2 v20.16b, v25.8h
|
||||
sqxtun2 v21.16b, v27.8h
|
||||
|
||||
uqadd v4.16b, v4.16b, v20.16b
|
||||
uqadd v5.16b, v5.16b, v20.16b
|
||||
uqadd v6.16b, v6.16b, v20.16b
|
||||
uqadd v7.16b, v7.16b, v20.16b
|
||||
|
||||
uqsub v4.16b, v4.16b, v21.16b
|
||||
uqsub v5.16b, v5.16b, v21.16b
|
||||
uqsub v6.16b, v6.16b, v21.16b
|
||||
st1 {v4.16b}, [x2], x3
|
||||
uqsub v7.16b, v7.16b, v21.16b
|
||||
st1 {v5.16b}, [x2], x3
|
||||
st1 {v6.16b}, [x2], x3
|
||||
st1 {v7.16b}, [x2], x3
|
||||
.endm
|
||||
|
||||
// add16x16_idct_dc_neon: adds 16 rounded DC values to a 16x16 byte block.
// x0 = pixels (rows FDEC_STRIDE apart, updated in place), x1 = 16 int16 DC
// values. Each group of four DCs (v0..v3) is rounded with srshr #6 and
// handed to ADD16x4_IDCT_DC, which processes four pixel rows per expansion.
// NOTE(review): the C prototype is not visible in this chunk.
function add16x16_idct_dc_neon, export=1
|
||||
// x0 stays the read pointer, x2 becomes the write pointer for the macro
mov x2, x0
|
||||
mov x3, #FDEC_STRIDE
|
||||
|
||||
ld1 {v0.4h,v1.4h,v2.4h,v3.4h}, [x1]
|
||||
srshr v0.4h, v0.4h, #6
|
||||
srshr v1.4h, v1.4h, #6
|
||||
|
||||
// the remaining rounding shifts are issued between the macro expansions,
// always before the expansion that consumes them
ADD16x4_IDCT_DC v0.h
|
||||
srshr v2.4h, v2.4h, #6
|
||||
ADD16x4_IDCT_DC v1.h
|
||||
srshr v3.4h, v3.4h, #6
|
||||
ADD16x4_IDCT_DC v2.h
|
||||
ADD16x4_IDCT_DC v3.h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// sub4x4x2_dct_dc: column sums of the pixel difference over an 8-wide,
// 4-row strip.
// dst     = result register name (no arrangement suffix); receives eight
//           int16 lanes, lane c = sum over the four rows of
//           ([x1] - [x2]) in column c.
// t0..t7  = scratch register names (no suffix), all overwritten.
// x1 / x2 are read with row strides x3 / x4 and are left advanced by four
// rows, so consecutive expansions walk down the block.
.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7
|
||||
ld1 {\t0\().8b}, [x1], x3
|
||||
ld1 {\t1\().8b}, [x2], x4
|
||||
ld1 {\t2\().8b}, [x1], x3
|
||||
ld1 {\t3\().8b}, [x2], x4
|
||||
usubl \t0\().8h, \t0\().8b, \t1\().8b
|
||||
ld1 {\t4\().8b}, [x1], x3
|
||||
ld1 {\t5\().8b}, [x2], x4
|
||||
usubl \t1\().8h, \t2\().8b, \t3\().8b
|
||||
ld1 {\t6\().8b}, [x1], x3
|
||||
ld1 {\t7\().8b}, [x2], x4
|
||||
add \dst\().8h, \t0\().8h, \t1\().8h
|
||||
usubl \t2\().8h, \t4\().8b, \t5\().8b
|
||||
usubl \t3\().8h, \t6\().8b, \t7\().8b
|
||||
add \dst\().8h, \dst\().8h, \t2\().8h
|
||||
add \dst\().8h, \dst\().8h, \t3\().8h
|
||||
.endm
|
||||
|
||||
// sub8x8_dct_dc_neon: DC-only transform of an 8x8 pixel difference.
// x0 = output (four int16 are stored), x1 / x2 = pixel sources with
// FENC_STRIDE / FDEC_STRIDE rows.
// The two macro expansions produce per-column difference sums for rows 0-3
// (v0) and rows 4-7 (v1); the transpose / SUMSUB_AB sequence and the two
// pairwise adds then reduce and combine them into the four stored values.
// SUMSUB_AB and transpose are defined outside this chunk.
// NOTE(review): the C prototype is not visible in this chunk.
function sub8x8_dct_dc_neon, export=1
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
|
||||
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
|
||||
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
|
||||
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||
|
||||
// two pairwise-add rounds fold each group of four lanes into one value
addp v0.8h, v2.8h, v3.8h
|
||||
addp v0.8h, v0.8h, v0.8h
|
||||
|
||||
st1 {v0.4h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// sub8x16_dct_dc_neon: DC-only transform of an 8-wide, 16-row pixel
// difference. x0 = output (eight int16 are stored), x1 / x2 = pixel sources
// with FENC_STRIDE / FDEC_STRIDE rows.
// The four macro expansions produce per-column difference sums for rows
// 0-3, 4-7, 8-11 and 12-15 (v0..v3); the pairwise adds and the transpose /
// SUMSUB_AB stages then reduce and combine them into the eight stored values.
// SUMSUB_AB and transpose are defined outside this chunk.
// NOTE(review): the C prototype is not visible in this chunk.
function sub8x16_dct_dc_neon, export=1
|
||||
mov x3, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23
|
||||
sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23
|
||||
sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31
|
||||
|
||||
// first pairwise reduction: adjacent columns are summed
addp v4.8h, v0.8h, v2.8h
|
||||
addp v5.8h, v1.8h, v3.8h
|
||||
|
||||
transpose v2.4s, v3.4s, v4.4s, v5.4s
|
||||
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||
|
||||
transpose v2.4s, v3.4s, v0.4s, v1.4s
|
||||
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||
|
||||
transpose v2.2d, v3.2d, v0.2d, v1.2d
|
||||
SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h
|
||||
|
||||
// note the swapped operand order of the trn2 (v1, v0)
trn1 v2.2d, v0.2d, v1.2d
|
||||
trn2 v3.2d, v1.2d, v0.2d
|
||||
|
||||
addp v0.8h, v2.8h, v3.8h
|
||||
|
||||
st1 {v0.8h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// zigzag_interleave_8x8_cavlc_neon: splits 64 int16 coefficients into four
// groups of 16 by index modulo 4, and records per group whether any
// coefficient is non-zero.
// x0 = destination (128 bytes written), x1 = source (128 bytes read),
// x2 = byte array receiving the four 0/1 flags at offsets 0, 1, 8 and 9.
// NOTE(review): the C prototype is not visible in this chunk.
function zigzag_interleave_8x8_cavlc_neon, export=1
|
||||
// x3 = 7: post-increment that moves x2 from offset 1 to offset 8
mov x3, #7
|
||||
movi v31.4s, #1
|
||||
// ld4 de-interleaves with stride 4: v0/v4 get elements 0,4,8,... of the
// first/second 32 coefficients, v1/v5 elements 1,5,9,... and so on
ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
|
||||
ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
|
||||
// unsigned max over raw 16-bit patterns: zero only if every input is zero
umax v16.8h, v0.8h, v4.8h
|
||||
umax v17.8h, v1.8h, v5.8h
|
||||
umax v18.8h, v2.8h, v6.8h
|
||||
umax v19.8h, v3.8h, v7.8h
|
||||
// group k is stored as its two halves back to back (v0,v4 / v1,v5 / ...)
st1 {v0.8h}, [x0], #16
|
||||
st1 {v4.8h}, [x0], #16
|
||||
umaxp v16.8h, v16.8h, v17.8h
|
||||
umaxp v18.8h, v18.8h, v19.8h
|
||||
st1 {v1.8h}, [x0], #16
|
||||
st1 {v5.8h}, [x0], #16
|
||||
// after this, 32-bit lane k of v16 is non-zero iff group k has a non-zero
umaxp v16.8h, v16.8h, v18.8h
|
||||
st1 {v2.8h}, [x0], #16
|
||||
st1 {v6.8h}, [x0], #16
|
||||
// lane >= 1 gives all-ones, and-ing with 1 turns that into 0 or 1
cmhs v16.4s, v16.4s, v31.4s
|
||||
st1 {v3.8h}, [x0], #16
|
||||
and v16.16b, v16.16b, v31.16b
|
||||
st1 {v7.8h}, [x0], #16
|
||||
// flags for groups 0..3 go to x2 + 0, + 1, + 8, + 9
st1 {v16.b}[0], [x2], #1
|
||||
st1 {v16.b}[4], [x2], x3
|
||||
st1 {v16.b}[8], [x2], #1
|
||||
st1 {v16.b}[12], [x2]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// zigzag_scan_4x4_frame_neon: reorders 16 int16 coefficients from [x1] into
// [x0] using the byte permutation in scan4x4_frame. x2 is used as the table
// pointer.
// NOTE(review): the C prototype is not visible in this chunk.
function zigzag_scan_4x4_frame_neon, export=1
|
||||
movrel x2, scan4x4_frame
|
||||
ld1 {v0.16b,v1.16b}, [x1]
|
||||
ld1 {v16.16b,v17.16b}, [x2]
|
||||
// two-register table lookup: every index byte selects from the 32 input bytes
tbl v2.16b, {v0.16b,v1.16b}, v16.16b
|
||||
tbl v3.16b, {v0.16b,v1.16b}, v17.16b
|
||||
st1 {v2.16b,v3.16b}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// zigzag_sub_4x4: generates the function zigzag_sub_4x4[ac]_<f>_neon.
// f  = frame or field; selects the permutation table sub4x4_<f>.
// ac = empty, or the literal ac for the variant that splits off the first
//      coefficient.
// Generated function, as visible in the body:
//   x0 = output: 16 int16 differences in permuted order
//   x1 = first pixel source, rows FENC_STRIDE apart
//   x2 = second pixel source, rows FDEC_STRIDE apart; the 4x4 block from x1
//        is also copied over it (through x6)
//   x3 = (ac variant only) address receiving the first difference as a
//        16-bit value; that element is then zeroed in the output
//   returns w0 = 1 if any stored difference is non-zero, else 0
// NOTE(review): the C prototypes are not visible in this chunk.
.macro zigzag_sub_4x4 f ac
|
||||
function zigzag_sub_4x4\ac\()_\f\()_neon, export=1
|
||||
mov x9, #FENC_STRIDE
|
||||
mov x4, #FDEC_STRIDE
|
||||
movrel x5, sub4x4_\f
|
||||
// x6 keeps the start of the second source for the copy-back stores
mov x6, x2
|
||||
// gather the 4x4 block from x1 into v0, one 4-byte row per 32-bit lane
ld1 {v0.s}[0], [x1], x9
|
||||
ld1 {v0.s}[1], [x1], x9
|
||||
ld1 {v0.s}[2], [x1], x9
|
||||
ld1 {v0.s}[3], [x1], x9
|
||||
ld1 {v16.16b}, [x5]
|
||||
// same for the block at x2, into v1
ld1 {v1.s}[0], [x2], x4
|
||||
ld1 {v1.s}[1], [x2], x4
|
||||
ld1 {v1.s}[2], [x2], x4
|
||||
ld1 {v1.s}[3], [x2], x4
|
||||
// permute both blocks with the same table, then subtract
tbl v2.16b, {v0.16b}, v16.16b
|
||||
tbl v3.16b, {v1.16b}, v16.16b
|
||||
// copy-back of the unpermuted x1 block, interleaved with the arithmetic
st1 {v0.s}[0], [x6], x4
|
||||
usubl v4.8h, v2.8b, v3.8b
|
||||
.ifc \ac, ac
|
||||
// pull out element 0, clear it in the vector and store it at [x3]
dup h7, v4.h[0]
|
||||
ins v4.h[0], wzr
|
||||
fmov w5, s7
|
||||
strh w5, [x3]
|
||||
.endif
|
||||
usubl2 v5.8h, v2.16b, v3.16b
|
||||
st1 {v0.s}[1], [x6], x4
|
||||
// unsigned max over raw 16-bit patterns is zero only if all are zero
umax v6.8h, v4.8h, v5.8h
|
||||
umaxv h6, v6.8h
|
||||
st1 {v0.s}[2], [x6], x4
|
||||
fmov w7, s6
|
||||
st1 {v0.s}[3], [x6], x4
|
||||
cmp w7, #0
|
||||
st1 {v4.8h,v5.8h}, [x0]
|
||||
cset w0, ne
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// instantiate the four variants: field / frame, each without and with ac
zigzag_sub_4x4 field
|
||||
zigzag_sub_4x4 field, ac
|
||||
zigzag_sub_4x4 frame
|
||||
zigzag_sub_4x4 frame, ac
|
||||
|
||||
// zigzag_scan_4x4_field_neon: copies 16 int16 coefficients from [x1] to
// [x0], reordering only the first eight with the scan4x4_field table; the
// second eight (v1) are stored unchanged. x2 is used as the table pointer.
// NOTE(review): the C prototype is not visible in this chunk.
function zigzag_scan_4x4_field_neon, export=1
|
||||
movrel x2, scan4x4_field
|
||||
ld1 {v0.8h,v1.8h}, [x1]
|
||||
ld1 {v16.16b}, [x2]
|
||||
tbl v0.16b, {v0.16b}, v16.16b
|
||||
st1 {v0.8h,v1.8h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// zigzag_scan_8x8_frame_neon: reorders 64 int16 coefficients from [x1] into
// [x0] using the 128-byte index table scan8x8_frame (x2 = table pointer).
// v0-v7 each hold eight consecutive input coefficients. tbl can address at
// most four source registers at once, so every output vector is built from a
// four-register window and the table indices are relative to the first
// register of that window. Output elements whose source lies outside the
// window are patched afterwards with single-lane moves.
// NOTE(review): the C prototype is not visible in this chunk.
function zigzag_scan_8x8_frame_neon, export=1
|
||||
movrel x2, scan8x8_frame
|
||||
ld1 {v0.8h,v1.8h}, [x1], #32
|
||||
ld1 {v2.8h,v3.8h}, [x1], #32
|
||||
ld1 {v4.8h,v5.8h}, [x1], #32
|
||||
ld1 {v6.8h,v7.8h}, [x1]
|
||||
ld1 {v16.16b,v17.16b}, [x2], #32
|
||||
ld1 {v18.16b,v19.16b}, [x2], #32
|
||||
ld1 {v20.16b,v21.16b}, [x2], #32
|
||||
ld1 {v22.16b,v23.16b}, [x2], #32
|
||||
// windows: v0-v3 for v24-v26 and v28, v3-v6 for v27, v4-v7 for v29-v31
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
|
||||
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
|
||||
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
|
||||
tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
|
||||
tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
|
||||
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
|
||||
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
|
||||
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
|
||||
// patch the elements that fall outside the window used above
mov v25.h[6], v4.h[0]
|
||||
mov v25.h[7], v5.h[0]
|
||||
mov v26.h[0], v4.h[1]
|
||||
mov v27.h[4], v7.h[0]
|
||||
mov v28.h[7], v4.h[4]
|
||||
mov v29.h[7], v3.h[6]
|
||||
mov v30.h[0], v2.h[7]
|
||||
mov v30.h[1], v3.h[7]
|
||||
st1 {v24.8h,v25.8h}, [x0], #32
|
||||
st1 {v26.8h,v27.8h}, [x0], #32
|
||||
st1 {v28.8h,v29.8h}, [x0], #32
|
||||
st1 {v30.8h,v31.8h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Index table for zigzag_scan_8x8_frame_neon: 64 entries of two bytes each
// (128 bytes), consumed 16 bytes per output vector.
// Z(z)   expands to the byte pair (2z, 2z+1), i.e. the two bytes of 16-bit
//        element z.
// T(x,y) names the coefficient at index x*8 + y. It is redefined below with
//        an (x - k) bias so the index becomes relative to register v<k>, the
//        first register of the tbl window used for that part of the table
//        (k = 0, 3, 0, 4 in turn). Entries that end up outside the 64-byte
//        window are the ones the function patches with lane moves.
#define Z(z) 2*(z), 2*(z)+1
|
||||
#define T(x,y) Z(x*8+y)
|
||||
const scan8x8_frame, align=5
|
||||
.byte T(0,0), T(1,0), T(0,1), T(0,2)
|
||||
.byte T(1,1), T(2,0), T(3,0), T(2,1)
|
||||
.byte T(1,2), T(0,3), T(0,4), T(1,3)
|
||||
.byte T(2,2), T(3,1), T(4,0), T(5,0)
|
||||
.byte T(4,1), T(3,2), T(2,3), T(1,4)
|
||||
.byte T(0,5), T(0,6), T(1,5), T(2,4)
|
||||
#undef T
|
||||
// next 16 bytes: window starts at v3
#define T(x,y) Z((x-3)*8+y)
|
||||
.byte T(3,3), T(4,2), T(5,1), T(6,0)
|
||||
.byte T(7,0), T(6,1), T(5,2), T(4,3)
|
||||
#undef T
|
||||
// next 16 bytes: window starts at v0 again
#define T(x,y) Z((x-0)*8+y)
|
||||
.byte T(3,4), T(2,5), T(1,6), T(0,7)
|
||||
.byte T(1,7), T(2,6), T(3,5), T(4,4)
|
||||
#undef T
|
||||
// last 48 bytes: window starts at v4
#define T(x,y) Z((x-4)*8+y)
|
||||
.byte T(5,3), T(6,2), T(7,1), T(7,2)
|
||||
.byte T(6,3), T(5,4), T(4,5), T(3,6)
|
||||
.byte T(2,7), T(3,7), T(4,6), T(5,5)
|
||||
.byte T(6,4), T(7,3), T(7,4), T(6,5)
|
||||
.byte T(5,6), T(4,7), T(5,7), T(6,6)
|
||||
.byte T(7,5), T(7,6), T(6,7), T(7,7)
|
||||
endconst
|
||||
|
||||
// zigzag_scan_8x8_field_neon: reorders 64 int16 coefficients from [x1] into
// [x0] using the 112-byte index table scan8x8_field (x2 = table pointer).
// v0-v7 each hold eight consecutive input coefficients. Seven output vectors
// come from tbl lookups whose source window slides down the input (v0.., v1..,
// ... v5..); the eighth is assembled with two ext instructions instead of a
// table lookup.
// NOTE(review): the C prototype is not visible in this chunk.
function zigzag_scan_8x8_field_neon, export=1
|
||||
movrel x2, scan8x8_field
|
||||
ld1 {v0.8h,v1.8h}, [x1], #32
|
||||
ld1 {v2.8h,v3.8h}, [x1], #32
|
||||
ld1 {v4.8h,v5.8h}, [x1], #32
|
||||
ld1 {v6.8h,v7.8h}, [x1]
|
||||
ld1 {v16.16b,v17.16b}, [x2], #32
|
||||
ld1 {v18.16b,v19.16b}, [x2], #32
|
||||
ld1 {v20.16b,v21.16b}, [x2], #32
|
||||
ld1 {v22.16b}, [x2]
|
||||
// rotate v7 by two elements: v31 = {v7.h[2..7], v7.h[0..1]}
ext v31.16b, v7.16b, v7.16b, #4
|
||||
tbl v24.16b, {v0.16b,v1.16b}, v16.16b
|
||||
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
|
||||
tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
|
||||
tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
|
||||
tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
|
||||
tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
|
||||
tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
|
||||
// last output vector = {v6.h[6], v6.h[7], v7.h[2..7]}
ext v31.16b, v6.16b, v31.16b, #12
|
||||
st1 {v24.8h,v25.8h}, [x0], #32
|
||||
st1 {v26.8h,v27.8h}, [x0], #32
|
||||
st1 {v28.8h,v29.8h}, [x0], #32
|
||||
st1 {v30.8h,v31.8h}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// zigzag_sub8x8: generates the function zigzag_sub_8x8_<f>_neon.
// f = frame or field; selects the permutation table sub8x8_<f>.
// Generated function, as visible in the body:
//   x0 = output: 64 int16 differences in permuted order (128 bytes)
//   x1 = first pixel source, rows FENC_STRIDE apart
//   x2 = second pixel source, rows FDEC_STRIDE apart; the 8x8 block from x1
//        is also copied over it (through x7)
//   returns w0 = 1 if any stored difference is non-zero, else 0
// NOTE(review): the C prototypes are not visible in this chunk.
.macro zigzag_sub8x8 f
|
||||
function zigzag_sub_8x8_\f\()_neon, export=1
|
||||
movrel x4, sub8x8_\f
|
||||
mov x5, #FENC_STRIDE
|
||||
mov x6, #FDEC_STRIDE
|
||||
// x7 keeps the start of the second source for the copy-back stores
mov x7, x2
|
||||
// gather the 8x8 block from x1 into v0-v3, two 8-byte rows per register
ld1 {v0.d}[0], [x1], x5
|
||||
ld1 {v0.d}[1], [x1], x5
|
||||
ld1 {v1.d}[0], [x1], x5
|
||||
ld1 {v1.d}[1], [x1], x5
|
||||
ld1 {v2.d}[0], [x1], x5
|
||||
ld1 {v2.d}[1], [x1], x5
|
||||
ld1 {v3.d}[0], [x1], x5
|
||||
ld1 {v3.d}[1], [x1]
|
||||
// same for the block at x2, into v4-v7
ld1 {v4.d}[0], [x2], x6
|
||||
ld1 {v4.d}[1], [x2], x6
|
||||
ld1 {v5.d}[0], [x2], x6
|
||||
ld1 {v5.d}[1], [x2], x6
|
||||
ld1 {v6.d}[0], [x2], x6
|
||||
ld1 {v6.d}[1], [x2], x6
|
||||
ld1 {v7.d}[0], [x2], x6
|
||||
ld1 {v7.d}[1], [x2]
|
||||
// 64 byte indices; all 64 pixels fit one four-register tbl window
ld1 {v16.16b,v17.16b}, [x4], #32
|
||||
ld1 {v18.16b,v19.16b}, [x4], #32
|
||||
tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
|
||||
tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
|
||||
tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
|
||||
tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
|
||||
tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
|
||||
tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
|
||||
tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
|
||||
tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
|
||||
// widening subtract of the permuted blocks (x1 block minus x2 block)
usubl v4.8h, v24.8b, v28.8b
|
||||
usubl2 v5.8h, v24.16b, v28.16b
|
||||
usubl v6.8h, v25.8b, v29.8b
|
||||
usubl2 v7.8h, v25.16b, v29.16b
|
||||
usubl v16.8h, v26.8b, v30.8b
|
||||
usubl2 v17.8h, v26.16b, v30.16b
|
||||
usubl v18.8h, v27.8b, v31.8b
|
||||
usubl2 v19.8h, v27.16b, v31.16b
|
||||
// unsigned max over raw 16-bit patterns is zero only if all are zero
umax v20.8h, v4.8h, v5.8h
|
||||
umax v21.8h, v6.8h, v7.8h
|
||||
umax v22.8h, v16.8h, v17.8h
|
||||
umax v23.8h, v18.8h, v19.8h
|
||||
umax v20.8h, v20.8h, v21.8h
|
||||
umax v21.8h, v22.8h, v23.8h
|
||||
umax v20.8h, v20.8h, v21.8h
|
||||
umaxv h22, v20.8h
|
||||
// copy the unpermuted x1 block over the x2 block
st1 {v0.d}[0], [x7], x6
|
||||
st1 {v0.d}[1], [x7], x6
|
||||
st1 {v1.d}[0], [x7], x6
|
||||
st1 {v1.d}[1], [x7], x6
|
||||
st1 {v2.d}[0], [x7], x6
|
||||
st1 {v2.d}[1], [x7], x6
|
||||
st1 {v3.d}[0], [x7], x6
|
||||
st1 {v3.d}[1], [x7]
|
||||
st1 {v4.8h,v5.8h}, [x0], #32
|
||||
st1 {v6.8h,v7.8h}, [x0], #32
|
||||
st1 {v16.8h,v17.8h}, [x0], #32
|
||||
st1 {v18.8h,v19.8h}, [x0]
|
||||
fmov w9, s22
|
||||
cmp w9, #0
|
||||
cset w0, ne
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// instantiate the field and frame variants
zigzag_sub8x8 field
|
||||
zigzag_sub8x8 frame
|
||||
|
||||
// Index table for zigzag_scan_8x8_field_neon: 56 entries of two bytes each
// (112 bytes), 16 bytes per tbl lookup. The function builds its eighth output
// vector with ext, so no table entries exist for it.
// T(x,y) names the coefficient at index x*8 + y and expands through Z (the
// macro in effect from earlier in the file) to a byte pair. After the first
// 32 bytes T is redefined every 16 bytes with an (x - k) bias, k = 1..5, so
// the index is relative to register v<k>, the first register of the sliding
// tbl window used for that output vector.
#undef T
|
||||
#define T(x,y) Z(x*8+y)
|
||||
const scan8x8_field, align=5
|
||||
.byte T(0,0), T(0,1), T(0,2), T(1,0)
|
||||
.byte T(1,1), T(0,3), T(0,4), T(1,2)
|
||||
.byte T(2,0), T(1,3), T(0,5), T(0,6)
|
||||
.byte T(0,7), T(1,4), T(2,1), T(3,0)
|
||||
#undef T
|
||||
#define T(x,y) Z((x-1)*8+y)
|
||||
.byte T(2,2), T(1,5), T(1,6), T(1,7)
|
||||
.byte T(2,3), T(3,1), T(4,0), T(3,2)
|
||||
#undef T
|
||||
#define T(x,y) Z((x-2)*8+y)
|
||||
.byte T(2,4), T(2,5), T(2,6), T(2,7)
|
||||
.byte T(3,3), T(4,1), T(5,0), T(4,2)
|
||||
#undef T
|
||||
#define T(x,y) Z((x-3)*8+y)
|
||||
.byte T(3,4), T(3,5), T(3,6), T(3,7)
|
||||
.byte T(4,3), T(5,1), T(6,0), T(5,2)
|
||||
#undef T
|
||||
#define T(x,y) Z((x-4)*8+y)
|
||||
.byte T(4,4), T(4,5), T(4,6), T(4,7)
|
||||
.byte T(5,3), T(6,1), T(6,2), T(5,4)
|
||||
#undef T
|
||||
#define T(x,y) Z((x-5)*8+y)
|
||||
.byte T(5,5), T(5,6), T(5,7), T(6,3)
|
||||
.byte T(7,0), T(7,1), T(6,4), T(6,5)
|
||||
endconst
|
||||
|
||||
|
||||
// Pixel permutation for zigzag_sub_8x8_frame_neon: 64 single-byte indices
// into the 64-byte block that function gathers as eight consecutive 8-byte
// rows.
// T is redefined here with its parameters named (y,x) and the value x*8 + y:
// the FIRST argument is now the position inside a row and the second selects
// the row, the opposite of the T(x,y) used by the scan8x8 tables. No Z
// wrapper is applied because the elements are bytes.
// This definition of T stays in effect for sub8x8_field below.
#undef T
|
||||
#define T(y,x) x*8+y
|
||||
const sub8x8_frame, align=5
|
||||
.byte T(0,0), T(1,0), T(0,1), T(0,2)
|
||||
.byte T(1,1), T(2,0), T(3,0), T(2,1)
|
||||
.byte T(1,2), T(0,3), T(0,4), T(1,3)
|
||||
.byte T(2,2), T(3,1), T(4,0), T(5,0)
|
||||
.byte T(4,1), T(3,2), T(2,3), T(1,4)
|
||||
.byte T(0,5), T(0,6), T(1,5), T(2,4)
|
||||
.byte T(3,3), T(4,2), T(5,1), T(6,0)
|
||||
.byte T(7,0), T(6,1), T(5,2), T(4,3)
|
||||
.byte T(3,4), T(2,5), T(1,6), T(0,7)
|
||||
.byte T(1,7), T(2,6), T(3,5), T(4,4)
|
||||
.byte T(5,3), T(6,2), T(7,1), T(7,2)
|
||||
.byte T(6,3), T(5,4), T(4,5), T(3,6)
|
||||
.byte T(2,7), T(3,7), T(4,6), T(5,5)
|
||||
.byte T(6,4), T(7,3), T(7,4), T(6,5)
|
||||
.byte T(5,6), T(4,7), T(5,7), T(6,6)
|
||||
.byte T(7,5), T(7,6), T(6,7), T(7,7)
|
||||
endconst
|
||||
|
||||
// Pixel permutation for zigzag_sub_8x8_field_neon: 64 single-byte indices
// into the 64-byte block that function gathers as eight consecutive 8-byte
// rows. Uses the T macro in effect at this point in the file, whose value is
// (second argument)*8 + (first argument).
const sub8x8_field, align=5
|
||||
.byte T(0,0), T(0,1), T(0,2), T(1,0)
|
||||
.byte T(1,1), T(0,3), T(0,4), T(1,2)
|
||||
.byte T(2,0), T(1,3), T(0,5), T(0,6)
|
||||
.byte T(0,7), T(1,4), T(2,1), T(3,0)
|
||||
.byte T(2,2), T(1,5), T(1,6), T(1,7)
|
||||
.byte T(2,3), T(3,1), T(4,0), T(3,2)
|
||||
.byte T(2,4), T(2,5), T(2,6), T(2,7)
|
||||
.byte T(3,3), T(4,1), T(5,0), T(4,2)
|
||||
.byte T(3,4), T(3,5), T(3,6), T(3,7)
|
||||
.byte T(4,3), T(5,1), T(6,0), T(5,2)
|
||||
.byte T(4,4), T(4,5), T(4,6), T(4,7)
|
||||
.byte T(5,3), T(6,1), T(6,2), T(5,4)
|
||||
.byte T(5,5), T(5,6), T(5,7), T(6,3)
|
||||
.byte T(7,0), T(7,1), T(6,4), T(6,5)
|
||||
.byte T(6,6), T(6,7), T(7,2), T(7,3)
|
||||
.byte T(7,4), T(7,5), T(7,6), T(7,7)
|
||||
endconst
|
||||
103
common/aarch64/dct.h
Normal file
103
common/aarch64/dct.h
Normal file
@@ -0,0 +1,103 @@
|
||||
/*****************************************************************************
|
||||
* dct.h: aarch64 transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_DCT_H
|
||||
#define X264_AARCH64_DCT_H
|
||||
|
||||
// Every prototype in this header is preceded by a #define that routes the
// public name through x264_template().
// NOTE(review): x264_template is defined elsewhere; presumably it applies a
// build-specific symbol prefix -- confirm.

// 4x4 DC transforms: each takes a single 16-element int16_t array and has no
// separate output parameter.
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
|
||||
void x264_dct4x4dc_neon( int16_t d[16] );
|
||||
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
|
||||
void x264_idct4x4dc_neon( int16_t d[16] );
|
||||
|
||||
// sub*_dct: coefficient array first, then two uint8_t pixel pointers.
// The array extents (16, 4x16, 16x16) give the expected coefficient counts.
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
|
||||
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
|
||||
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
|
||||
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
// add*_idct: destination pixel pointer first, then the coefficient array.
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
|
||||
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
|
||||
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
|
||||
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
|
||||
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
|
||||
|
||||
// DC-only variants: the coefficient arrays hold 4, 16, 4 and 8 values.
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
|
||||
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
|
||||
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
|
||||
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
|
||||
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
|
||||
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
// 8x8-transform variants: 64 coefficients per 8x8 block, four blocks for 16x16.
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
|
||||
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
|
||||
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
|
||||
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
|
||||
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
|
||||
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
|
||||
|
||||
// Zigzag scans: level[] and dct[] have the same extent (16 for 4x4, 64 for
// 8x8); frame and field variants share a signature.
// NOTE(review): level is presumably the output and dct the input -- confirm
// against the assembly implementation.
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
|
||||
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
|
||||
#define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
|
||||
void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
|
||||
#define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
|
||||
void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
|
||||
void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
|
||||
|
||||
// 4x4 zigzag_sub: p_src is const, p_dst is writable, and an int is returned.
// The "ac" variants take an extra dctcoef *dc pointer.
// NOTE(review): the meaning of the int result is not visible here; presumably
// a non-zero-coefficient flag -- confirm.
#define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
|
||||
int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
|
||||
#define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
|
||||
int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
|
||||
#define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
|
||||
int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
|
||||
#define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
|
||||
int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
|
||||
|
||||
// 8x8 zigzag_sub entry points. An 8x8 block holds 64 coefficients (compare the
// level[64] parameters of the 8x8 zigzag_scan prototypes in this header), so
// the extent is written as 64. The extent of an array parameter is not part
// of the function type in C, so existing callers and the assembly
// implementations are unaffected.
// Parameters: level = coefficient array, p_src = const source pixels,
// p_dst = writable pixels. Returns int.
// NOTE(review): the meaning of the int result is not visible here -- confirm.
#define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
|
||||
int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
|
||||
#define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
|
||||
int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
|
||||
|
||||
// CAVLC 8x8 interleave: takes dst and src coefficient pointers plus a uint8_t
// nnz pointer.
// NOTE(review): nnz is presumably an output of non-zero flags/counts --
// confirm against the implementation.
#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
|
||||
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||
|
||||
// SVE / SVE2 variants. Each has the same signature as the _neon function of
// the same base name declared earlier in this header.
#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
|
||||
void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
|
||||
void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
|
||||
|
||||
#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
|
||||
void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||
|
||||
#endif
|
||||
43
common/aarch64/deblock-a-common.S
Normal file
43
common/aarch64/deblock-a-common.S
Normal file
@@ -0,0 +1,43 @@
|
||||
/*****************************************************************************
|
||||
* deblock-a-common.S: aarch64 deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: Mans Rullgard <mans@mansr.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
* David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
// This file contains the NEON macros that are intended to be used by
|
||||
// the SVE/SVE2 functions as well
|
||||
|
||||
// h264_loop_filter_start: common prologue of the clip-value based filters.
// Reads w2 (alpha) and w3 (beta), loads one 32-bit word from [x4] into w6 and
// copies it to v24.s[0]. It then either falls through to label 2 (continue)
// or executes "ret", which returns from the function that expanded the macro.
// Clobbers w8 and the condition flags.
// NOTE(review): the word at [x4] is presumably the four tc0 bytes -- confirm
// against the C prototype, which is not visible here.
.macro h264_loop_filter_start
|
||||
cmp w2, #0
|
||||
ldr w6, [x4]
|
||||
// alpha != 0: flags = compare(beta, 0); alpha == 0: flags = nzcv #0
ccmp w3, #0, #0, ne
|
||||
mov v24.s[0], w6
|
||||
and w8, w6, w6, lsl #16
|
||||
// Taken when alpha != 0 and beta == 0.
// NOTE(review): with nzcv #0 (Z clear) the alpha == 0 case does not take this
// exit and falls through to the byte-sign test below -- confirm intended.
b.eq 1f
|
||||
// Bit 31 of the result is the AND of the top bits of the four loaded bytes.
ands w8, w8, w8, lsl #8
|
||||
// ands clears V, so this continues unless all four bytes are negative.
b.ge 2f
|
||||
1:
|
||||
ret
|
||||
2:
|
||||
.endm
|
||||
98
common/aarch64/deblock-a-sve.S
Normal file
98
common/aarch64/deblock-a-sve.S
Normal file
@@ -0,0 +1,98 @@
|
||||
/*****************************************************************************
|
||||
* deblock-a-sve.S: aarch64 deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "deblock-a-common.S"
|
||||
|
||||
ENABLE_SVE
|
||||
|
||||
// h264_loop_filter_chroma_sve: chroma edge filter on 16 byte lanes that keeps
// the alpha/beta decision in an SVE predicate instead of a vector mask.
// In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//      v24.s[0] = four clip bytes (set by h264_loop_filter_start).
// Out: v16 = p0 + delta, v0 = q0 - delta for ALL lanes (saturated to u8);
//      p1 = predicate of lanes that passed the alpha and both beta tests.
//      The caller is expected to store v16/v0 under p1.
// Clobbers v4, v5, v22-v26, v28-v30, p0, p2, p3.
.macro h264_loop_filter_chroma_sve
|
||||
// p0 = first 16 byte lanes active
ptrue p0.b, vl16
|
||||
|
||||
dup v22.16b, w2 // alpha
|
||||
uxtl v24.8h, v24.8b
|
||||
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
|
||||
uxtl v4.8h, v0.8b
|
||||
uxtl2 v5.8h, v0.16b
|
||||
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||
usubw v4.8h, v4.8h, v16.8b
|
||||
usubw2 v5.8h, v5.8h, v16.16b
|
||||
sli v24.8h, v24.8h, #8
|
||||
shl v4.8h, v4.8h, #2
|
||||
shl v5.8h, v5.8h, #2
|
||||
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
|
||||
uxtl v24.4s, v24.4h
|
||||
uaddw v4.8h, v4.8h, v18.8b
|
||||
uaddw2 v5.8h, v5.8h, v18.16b
|
||||
|
||||
// p1 = lanes where alpha > abs(p0 - q0)
cmphi p1.b, p0/z, z22.b, z26.b
|
||||
usubw v4.8h, v4.8h, v2.8b
|
||||
usubw2 v5.8h, v5.8h, v2.16b
|
||||
// After this, each of the four clip bytes fills four consecutive lanes of v24.
sli v24.4s, v24.4s, #16
|
||||
dup v22.16b, w3 // beta
|
||||
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
|
||||
rshrn2 v4.16b, v5.8h, #3
|
||||
cmphi p2.b, p0/z, z22.b, z28.b
|
||||
cmphi p3.b, p0/z, z22.b, z30.b
|
||||
// Clamp delta to [-v24, v24] (smin here, smax below).
smin v4.16b, v4.16b, v24.16b
|
||||
neg v25.16b, v24.16b
|
||||
and p1.b, p0/z, p1.b, p2.b
|
||||
smax v4.16b, v4.16b, v25.16b
|
||||
and p1.b, p0/z, p1.b, p3.b
|
||||
uxtl v22.8h, v0.8b
|
||||
uxtl2 v23.8h, v0.16b
|
||||
|
||||
uxtl v28.8h, v16.8b
|
||||
uxtl2 v29.8h, v16.16b
|
||||
saddw v28.8h, v28.8h, v4.8b
|
||||
saddw2 v29.8h, v29.8h, v4.16b
|
||||
ssubw v22.8h, v22.8h, v4.8b
|
||||
ssubw2 v23.8h, v23.8h, v4.16b
|
||||
sqxtun v16.8b, v28.8h
|
||||
sqxtun v0.8b, v22.8h
|
||||
sqxtun2 v16.16b, v29.8h
|
||||
sqxtun2 v0.16b, v23.8h
|
||||
.endm
|
||||
|
||||
// deblock_v_chroma_sve: filters a horizontal chroma edge, 16 bytes wide.
// x0 = address of the first row below the edge (q0), x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start.
// Loads the rows at x0 - 2*x1 .. x0 + x1 as p1, p0, q0, q1 and writes back
// only p0 and q0, and only in the byte lanes selected by predicate p1.
function deblock_v_chroma_sve, export=1
|
||||
// May return early (see the macro).
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, x1, lsl #1
|
||||
// No performance improvement if sve load is used. So, continue using
|
||||
// NEON load here
|
||||
ld1 {v18.16b}, [x0], x1
|
||||
ld1 {v16.16b}, [x0], x1
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v2.16b}, [x0]
|
||||
|
||||
h264_loop_filter_chroma_sve
|
||||
|
||||
// x0 is at the q1 row; step back two rows to p0.
sub x0, x0, x1, lsl #1
|
||||
st1b {z16.b}, p1, [x0]
|
||||
add x0, x0, x1
|
||||
st1b {z0.b}, p1, [x0]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
800
common/aarch64/deblock-a.S
Normal file
800
common/aarch64/deblock-a.S
Normal file
@@ -0,0 +1,800 @@
|
||||
/*****************************************************************************
|
||||
* deblock-a.S: aarch64 deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: Mans Rullgard <mans@mansr.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "deblock-a-common.S"
|
||||
|
||||
// h264_loop_filter_luma: clip-value based luma edge filter on 16 byte lanes.
// In:  v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2,
//      w2 = alpha, w3 = beta,
//      v24.s[0] = four clip bytes (set by h264_loop_filter_start).
// Out: v17 = p1', v16 = p0', v0 = q0', v19 = q1'.
// Clobbers v4, v20-v24, v28, v30.
// The instruction order interleaves independent dependency chains; keep it.
.macro h264_loop_filter_luma
|
||||
dup v22.16b, w2 // alpha
|
||||
uxtl v24.8h, v24.8b
|
||||
uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
|
||||
uxtl v24.4s, v24.4h
|
||||
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||
sli v24.8h, v24.8h, #8
|
||||
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
|
||||
// After this, each of the four clip bytes fills four consecutive lanes of v24.
sli v24.4s, v24.4s, #16
|
||||
cmhi v21.16b, v22.16b, v21.16b // < alpha
|
||||
dup v22.16b, w3 // beta
|
||||
// v23 = lanes whose clip byte is negative; bic below removes them from the mask.
cmlt v23.16b, v24.16b, #0
|
||||
cmhi v28.16b, v22.16b, v28.16b // < beta
|
||||
cmhi v30.16b, v22.16b, v30.16b // < beta
|
||||
bic v21.16b, v21.16b, v23.16b
|
||||
uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
|
||||
and v21.16b, v21.16b, v28.16b
|
||||
uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
|
||||
cmhi v17.16b, v22.16b, v17.16b // < beta
|
||||
and v21.16b, v21.16b, v30.16b
|
||||
cmhi v19.16b, v22.16b, v19.16b // < beta
|
||||
and v17.16b, v17.16b, v21.16b
|
||||
and v19.16b, v19.16b, v21.16b
|
||||
// Clip value, zeroed in lanes where the filter mask is off.
and v24.16b, v24.16b, v21.16b
|
||||
urhadd v28.16b, v16.16b, v0.16b
|
||||
// v17/v19 are 0 or -1 per lane, so the two subtractions add 1 to the clip
// value for each side whose p2/q2 test passed.
sub v21.16b, v24.16b, v17.16b
|
||||
uqadd v23.16b, v18.16b, v24.16b
|
||||
uhadd v20.16b, v20.16b, v28.16b
|
||||
sub v21.16b, v21.16b, v19.16b
|
||||
uhadd v28.16b, v4.16b, v28.16b
|
||||
umin v23.16b, v23.16b, v20.16b
|
||||
uqsub v22.16b, v18.16b, v24.16b
|
||||
uqadd v4.16b, v2.16b, v24.16b
|
||||
umax v23.16b, v23.16b, v22.16b
|
||||
uqsub v22.16b, v2.16b, v24.16b
|
||||
umin v28.16b, v4.16b, v28.16b
|
||||
uxtl v4.8h, v0.8b
|
||||
umax v28.16b, v28.16b, v22.16b
|
||||
uxtl2 v20.8h, v0.16b
|
||||
usubw v4.8h, v4.8h, v16.8b
|
||||
usubw2 v20.8h, v20.8h, v16.16b
|
||||
shl v4.8h, v4.8h, #2
|
||||
shl v20.8h, v20.8h, #2
|
||||
uaddw v4.8h, v4.8h, v18.8b
|
||||
uaddw2 v20.8h, v20.8h, v18.16b
|
||||
usubw v4.8h, v4.8h, v2.8b
|
||||
usubw2 v20.8h, v20.8h, v2.16b
|
||||
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
|
||||
rshrn2 v4.16b, v20.8h, #3
|
||||
// p1' / q1': clamped candidate where the p2 / q2 test passed, else unchanged.
bsl v17.16b, v23.16b, v18.16b
|
||||
bsl v19.16b, v28.16b, v2.16b
|
||||
neg v23.16b, v21.16b
|
||||
uxtl v28.8h, v16.8b
|
||||
// Clamp delta to [-v21, v21] (smin here, smax below).
smin v4.16b, v4.16b, v21.16b
|
||||
uxtl2 v21.8h, v16.16b
|
||||
smax v4.16b, v4.16b, v23.16b
|
||||
uxtl v22.8h, v0.8b
|
||||
uxtl2 v24.8h, v0.16b
|
||||
saddw v28.8h, v28.8h, v4.8b
|
||||
saddw2 v21.8h, v21.8h, v4.16b
|
||||
ssubw v22.8h, v22.8h, v4.8b
|
||||
ssubw2 v24.8h, v24.8h, v4.16b
|
||||
sqxtun v16.8b, v28.8h
|
||||
sqxtun2 v16.16b, v21.8h
|
||||
sqxtun v0.8b, v22.8h
|
||||
sqxtun2 v0.16b, v24.8h
|
||||
.endm
|
||||
|
||||
// deblock_v_luma_neon: filters a horizontal luma edge, 16 bytes wide.
// x0 = address of the first row below the edge (q0), x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start.
// Reads three rows on each side of the edge and writes back the two rows
// nearest the edge on each side (p1, p0, q0, q1).
function deblock_v_luma_neon, export=1
|
||||
// May return early (see the macro).
h264_loop_filter_start
|
||||
|
||||
// q0, q1, q2
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v2.16b}, [x0], x1
|
||||
ld1 {v4.16b}, [x0], x1
|
||||
// x0 is three rows past the edge; go back six rows to p2.
sub x0, x0, x1, lsl #2
|
||||
sub x0, x0, x1, lsl #1
|
||||
// p2, p1, p0
ld1 {v20.16b}, [x0], x1
|
||||
ld1 {v18.16b}, [x0], x1
|
||||
ld1 {v16.16b}, [x0], x1
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
// x0 is back at the q0 row; step up two rows to p1.
sub x0, x0, x1, lsl #1
|
||||
st1 {v17.16b}, [x0], x1
|
||||
st1 {v16.16b}, [x0], x1
|
||||
st1 {v0.16b}, [x0], x1
|
||||
st1 {v19.16b}, [x0]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_luma_neon: filters a vertical luma edge over 16 rows.
// x0 = address of the first pixel right of the edge in the top row,
// x1 = row stride, w2 = alpha, w3 = beta,
// x4 = pointer read by h264_loop_filter_start.
// Loads 8 bytes (x0 - 4 .. x0 + 3) from each of 16 rows, transposes so each
// vector holds one pixel column, filters, transposes the four modified
// columns back and stores 4 bytes per row at x0 - 2.
function deblock_h_luma_neon, export=1
|
||||
// May return early (see the macro).
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, #4
|
||||
// Rows 0-7 go to the low halves, rows 8-15 to the high halves.
ld1 {v6.8b}, [x0], x1
|
||||
ld1 {v20.8b}, [x0], x1
|
||||
ld1 {v18.8b}, [x0], x1
|
||||
ld1 {v16.8b}, [x0], x1
|
||||
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v2.8b}, [x0], x1
|
||||
ld1 {v4.8b}, [x0], x1
|
||||
ld1 {v26.8b}, [x0], x1
|
||||
ld1 {v6.d}[1], [x0], x1
|
||||
ld1 {v20.d}[1], [x0], x1
|
||||
ld1 {v18.d}[1], [x0], x1
|
||||
ld1 {v16.d}[1], [x0], x1
|
||||
ld1 {v0.d}[1], [x0], x1
|
||||
ld1 {v2.d}[1], [x0], x1
|
||||
ld1 {v4.d}[1], [x0], x1
|
||||
ld1 {v26.d}[1], [x0], x1
|
||||
|
||||
transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
|
||||
|
||||
// Rewind the 16 rows consumed by the loads; +2 moves from x0 - 4 to x0 - 2.
sub x0, x0, x1, lsl #4
|
||||
add x0, x0, #2
|
||||
st1 {v17.s}[0], [x0], x1
|
||||
st1 {v16.s}[0], [x0], x1
|
||||
st1 {v0.s}[0], [x0], x1
|
||||
st1 {v19.s}[0], [x0], x1
|
||||
st1 {v17.s}[1], [x0], x1
|
||||
st1 {v16.s}[1], [x0], x1
|
||||
st1 {v0.s}[1], [x0], x1
|
||||
st1 {v19.s}[1], [x0], x1
|
||||
st1 {v17.s}[2], [x0], x1
|
||||
st1 {v16.s}[2], [x0], x1
|
||||
st1 {v0.s}[2], [x0], x1
|
||||
st1 {v19.s}[2], [x0], x1
|
||||
st1 {v17.s}[3], [x0], x1
|
||||
st1 {v16.s}[3], [x0], x1
|
||||
st1 {v0.s}[3], [x0], x1
|
||||
st1 {v19.s}[3], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// h264_loop_filter_start_intra: prologue of the intra filters.
// Executes "ret" (returning from the function that expanded the macro) when
// both w2 (alpha) and w3 (beta) are zero; otherwise broadcasts alpha into
// every byte of v30 and beta into every byte of v31.
// Clobbers w4 and the condition flags.
.macro h264_loop_filter_start_intra
|
||||
orr w4, w2, w3
|
||||
cmp w4, #0
|
||||
b.ne 1f
|
||||
ret
|
||||
1:
|
||||
dup v30.16b, w2 // alpha
|
||||
dup v31.16b, w3 // beta
|
||||
.endm
|
||||
|
||||
// h264_loop_filter_luma_intra: intra luma edge filter on 16 byte lanes.
// In:  v4 = p3, v5 = p2, v6 = p1, v7 = p0, v0 = q0, v1 = q1, v2 = q2, v3 = q3,
//      v30 = alpha and v31 = beta in every byte.
// Out: v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) updated in place.
// Branches to local label 9 -- which the expanding function must provide --
// when no lane passes the basic alpha/beta test.
// Clobbers x4, v16-v29, and v30/v31 (alpha and beta are overwritten).
// Conditions named in the comments below:
//   if_1 = abs(p0-q0) < alpha && abs(p1-p0) < beta && abs(q1-q0) < beta
//   if_2 = abs(p0-q0) < (alpha >> 2) + 2
//   if_3 = abs(p2-p0) < beta,  if_4 = abs(q2-q0) < beta
.macro h264_loop_filter_luma_intra
|
||||
uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
|
||||
uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
|
||||
uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
|
||||
cmhi v19.16b, v30.16b, v16.16b // < alpha
|
||||
cmhi v17.16b, v31.16b, v17.16b // < beta
|
||||
cmhi v18.16b, v31.16b, v18.16b // < beta
|
||||
|
||||
movi v29.16b, #2
|
||||
ushr v30.16b, v30.16b, #2 // alpha >> 2
|
||||
add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
|
||||
cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
|
||||
|
||||
and v19.16b, v19.16b, v17.16b
|
||||
and v19.16b, v19.16b, v18.16b
|
||||
// Narrow the 16 mask bytes of if_1 into 64 bits; zero means no lane passed.
shrn v20.8b, v19.8h, #4
|
||||
mov x4, v20.d[0]
|
||||
cbz x4, 9f
|
||||
|
||||
// v20/v21 = 2*p1 + p0 + q1, v22/v23 = 2*q1 + q0 + p1 (16-bit sums)
ushll v20.8h, v6.8b, #1
|
||||
ushll v22.8h, v1.8b, #1
|
||||
ushll2 v21.8h, v6.16b, #1
|
||||
ushll2 v23.8h, v1.16b, #1
|
||||
uaddw v20.8h, v20.8h, v7.8b
|
||||
uaddw v22.8h, v22.8h, v0.8b
|
||||
uaddw2 v21.8h, v21.8h, v7.16b
|
||||
uaddw2 v23.8h, v23.8h, v0.16b
|
||||
uaddw v20.8h, v20.8h, v1.8b
|
||||
uaddw v22.8h, v22.8h, v6.8b
|
||||
uaddw2 v21.8h, v21.8h, v1.16b
|
||||
uaddw2 v23.8h, v23.8h, v6.16b
|
||||
|
||||
rshrn v24.8b, v20.8h, #2 // p0'_1
|
||||
rshrn v25.8b, v22.8h, #2 // q0'_1
|
||||
rshrn2 v24.16b, v21.8h, #2 // p0'_1
|
||||
rshrn2 v25.16b, v23.8h, #2 // q0'_1
|
||||
|
||||
uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
|
||||
uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
|
||||
cmhi v17.16b, v31.16b, v17.16b // < beta
|
||||
cmhi v18.16b, v31.16b, v18.16b // < beta
|
||||
|
||||
and v17.16b, v16.16b, v17.16b // if_2 && if_3
|
||||
and v18.16b, v16.16b, v18.16b // if_2 && if_4
|
||||
|
||||
// From here on v30/v31 hold selection masks, not alpha/beta.
not v30.16b, v17.16b
|
||||
not v31.16b, v18.16b
|
||||
|
||||
and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
|
||||
and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
|
||||
|
||||
and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
|
||||
and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
|
||||
|
||||
//calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
|
||||
uaddl v26.8h, v5.8b, v7.8b
|
||||
uaddl2 v27.8h, v5.16b, v7.16b
|
||||
uaddw v26.8h, v26.8h, v0.8b
|
||||
uaddw2 v27.8h, v27.8h, v0.16b
|
||||
add v20.8h, v20.8h, v26.8h
|
||||
add v21.8h, v21.8h, v27.8h
|
||||
uaddw v20.8h, v20.8h, v0.8b
|
||||
uaddw2 v21.8h, v21.8h, v0.16b
|
||||
rshrn v20.8b, v20.8h, #3 // p0'_2
|
||||
rshrn2 v20.16b, v21.8h, #3 // p0'_2
|
||||
uaddw v26.8h, v26.8h, v6.8b
|
||||
uaddw2 v27.8h, v27.8h, v6.16b
|
||||
rshrn v21.8b, v26.8h, #2 // p1'_2
|
||||
rshrn2 v21.16b, v27.8h, #2 // p1'_2
|
||||
uaddl v28.8h, v4.8b, v5.8b
|
||||
uaddl2 v29.8h, v4.16b, v5.16b
|
||||
shl v28.8h, v28.8h, #1
|
||||
shl v29.8h, v29.8h, #1
|
||||
add v28.8h, v28.8h, v26.8h
|
||||
add v29.8h, v29.8h, v27.8h
|
||||
rshrn v19.8b, v28.8h, #3 // p2'_2
|
||||
rshrn2 v19.16b, v29.8h, #3 // p2'_2
|
||||
|
||||
//calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
|
||||
uaddl v26.8h, v2.8b, v0.8b
|
||||
uaddl2 v27.8h, v2.16b, v0.16b
|
||||
uaddw v26.8h, v26.8h, v7.8b
|
||||
uaddw2 v27.8h, v27.8h, v7.16b
|
||||
add v22.8h, v22.8h, v26.8h
|
||||
add v23.8h, v23.8h, v27.8h
|
||||
uaddw v22.8h, v22.8h, v7.8b
|
||||
uaddw2 v23.8h, v23.8h, v7.16b
|
||||
rshrn v22.8b, v22.8h, #3 // q0'_2
|
||||
rshrn2 v22.16b, v23.8h, #3 // q0'_2
|
||||
uaddw v26.8h, v26.8h, v1.8b
|
||||
uaddw2 v27.8h, v27.8h, v1.16b
|
||||
rshrn v23.8b, v26.8h, #2 // q1'_2
|
||||
rshrn2 v23.16b, v27.8h, #2 // q1'_2
|
||||
uaddl v28.8h, v2.8b, v3.8b
|
||||
uaddl2 v29.8h, v2.16b, v3.16b
|
||||
shl v28.8h, v28.8h, #1
|
||||
shl v29.8h, v29.8h, #1
|
||||
add v28.8h, v28.8h, v26.8h
|
||||
add v29.8h, v29.8h, v27.8h
|
||||
rshrn v26.8b, v28.8h, #3 // q2'_2
|
||||
rshrn2 v26.16b, v29.8h, #3 // q2'_2
|
||||
|
||||
// bit inserts the new value only in lanes where the mask (last operand) is set.
bit v7.16b, v24.16b, v30.16b // p0'_1
|
||||
bit v0.16b, v25.16b, v31.16b // q0'_1
|
||||
bit v7.16b, v20.16b, v17.16b // p0'_2
|
||||
bit v6.16b, v21.16b, v17.16b // p1'_2
|
||||
bit v5.16b, v19.16b, v17.16b // p2'_2
|
||||
bit v0.16b, v22.16b, v18.16b // q0'_2
|
||||
bit v1.16b, v23.16b, v18.16b // q1'_2
|
||||
bit v2.16b, v26.16b, v18.16b // q2'_2
|
||||
.endm
|
||||
|
||||
// deblock_v_luma_intra_neon: intra filter for a horizontal luma edge,
// 16 bytes wide.
// x0 = address of the first row below the edge (q0), x1 = row stride,
// w2 = alpha, w3 = beta.
// Reads four rows on each side of the edge and writes back three rows on
// each side (p2..q2). Label 9 is the exit taken by the filter macro when no
// lane needs filtering; it skips the stores.
function deblock_v_luma_intra_neon, export=1
|
||||
// Returns immediately when alpha and beta are both zero.
h264_loop_filter_start_intra
|
||||
|
||||
ld1 {v0.16b}, [x0], x1 // q0
|
||||
ld1 {v1.16b}, [x0], x1 // q1
|
||||
ld1 {v2.16b}, [x0], x1 // q2
|
||||
ld1 {v3.16b}, [x0], x1 // q3
|
||||
// x0 is four rows past the edge; go back eight rows to p3.
sub x0, x0, x1, lsl #3
|
||||
ld1 {v4.16b}, [x0], x1 // p3
|
||||
ld1 {v5.16b}, [x0], x1 // p2
|
||||
ld1 {v6.16b}, [x0], x1 // p1
|
||||
ld1 {v7.16b}, [x0] // p0
|
||||
|
||||
h264_loop_filter_luma_intra
|
||||
|
||||
// x0 is at the p0 row; step up two rows to p2.
sub x0, x0, x1, lsl #1
|
||||
st1 {v5.16b}, [x0], x1 // p2
|
||||
st1 {v6.16b}, [x0], x1 // p1
|
||||
st1 {v7.16b}, [x0], x1 // p0
|
||||
st1 {v0.16b}, [x0], x1 // q0
|
||||
st1 {v1.16b}, [x0], x1 // q1
|
||||
st1 {v2.16b}, [x0] // q2
|
||||
9:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_luma_intra_neon: intra filter for a vertical luma edge, 16 rows.
// x0 = address of the first pixel right of the edge in the top row,
// x1 = row stride, w2 = alpha, w3 = beta.
// Loads 8 bytes (x0 - 4 .. x0 + 3) from each of 16 rows, transposes so that
// v4..v7 = p3..p0 and v0..v3 = q0..q3, filters, transposes back and rewrites
// all 8 bytes of every row. Label 9 is the exit taken by the filter macro
// when no lane needs filtering; it skips the transpose and the stores.
function deblock_h_luma_intra_neon, export=1
|
||||
// Returns immediately when alpha and beta are both zero.
h264_loop_filter_start_intra
|
||||
|
||||
sub x0, x0, #4
|
||||
// Rows 0-7 go to the low halves, rows 8-15 to the high halves.
ld1 {v4.8b}, [x0], x1
|
||||
ld1 {v5.8b}, [x0], x1
|
||||
ld1 {v6.8b}, [x0], x1
|
||||
ld1 {v7.8b}, [x0], x1
|
||||
ld1 {v0.8b}, [x0], x1
|
||||
ld1 {v1.8b}, [x0], x1
|
||||
ld1 {v2.8b}, [x0], x1
|
||||
ld1 {v3.8b}, [x0], x1
|
||||
ld1 {v4.d}[1], [x0], x1
|
||||
ld1 {v5.d}[1], [x0], x1
|
||||
ld1 {v6.d}[1], [x0], x1
|
||||
ld1 {v7.d}[1], [x0], x1
|
||||
ld1 {v0.d}[1], [x0], x1
|
||||
ld1 {v1.d}[1], [x0], x1
|
||||
ld1 {v2.d}[1], [x0], x1
|
||||
ld1 {v3.d}[1], [x0], x1
|
||||
|
||||
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
|
||||
|
||||
h264_loop_filter_luma_intra
|
||||
|
||||
transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
|
||||
|
||||
// Rewind the 16 rows consumed by the loads.
sub x0, x0, x1, lsl #4
|
||||
st1 {v4.8b}, [x0], x1
|
||||
st1 {v5.8b}, [x0], x1
|
||||
st1 {v6.8b}, [x0], x1
|
||||
st1 {v7.8b}, [x0], x1
|
||||
st1 {v0.8b}, [x0], x1
|
||||
st1 {v1.8b}, [x0], x1
|
||||
st1 {v2.8b}, [x0], x1
|
||||
st1 {v3.8b}, [x0], x1
|
||||
st1 {v4.d}[1], [x0], x1
|
||||
st1 {v5.d}[1], [x0], x1
|
||||
st1 {v6.d}[1], [x0], x1
|
||||
st1 {v7.d}[1], [x0], x1
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
st1 {v1.d}[1], [x0], x1
|
||||
st1 {v2.d}[1], [x0], x1
|
||||
st1 {v3.d}[1], [x0], x1
|
||||
9:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// h264_loop_filter_chroma: clip-value based chroma edge filter on 16 byte lanes.
// In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 = alpha, w3 = beta,
//      v24.s[0] = four clip bytes (set by h264_loop_filter_start).
// Out: v16 = p0', v0 = q0' (unchanged in lanes that fail the alpha/beta tests).
// Clobbers v4, v5, v22-v26, v28-v30.
// The instruction order interleaves independent dependency chains; keep it.
.macro h264_loop_filter_chroma
|
||||
dup v22.16b, w2 // alpha
|
||||
uxtl v24.8h, v24.8b
|
||||
uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
|
||||
uxtl v4.8h, v0.8b
|
||||
uxtl2 v5.8h, v0.16b
|
||||
uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||
usubw v4.8h, v4.8h, v16.8b
|
||||
usubw2 v5.8h, v5.8h, v16.16b
|
||||
sli v24.8h, v24.8h, #8
|
||||
shl v4.8h, v4.8h, #2
|
||||
shl v5.8h, v5.8h, #2
|
||||
uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
|
||||
uxtl v24.4s, v24.4h
|
||||
uaddw v4.8h, v4.8h, v18.8b
|
||||
uaddw2 v5.8h, v5.8h, v18.16b
|
||||
cmhi v26.16b, v22.16b, v26.16b // < alpha
|
||||
usubw v4.8h, v4.8h, v2.8b
|
||||
usubw2 v5.8h, v5.8h, v2.16b
|
||||
// After this, each of the four clip bytes fills four consecutive lanes of v24.
sli v24.4s, v24.4s, #16
|
||||
dup v22.16b, w3 // beta
|
||||
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
|
||||
rshrn2 v4.16b, v5.8h, #3
|
||||
cmhi v28.16b, v22.16b, v28.16b // < beta
|
||||
cmhi v30.16b, v22.16b, v30.16b // < beta
|
||||
// Clamp delta to [-v24, v24] (smin here, smax below).
smin v4.16b, v4.16b, v24.16b
|
||||
neg v25.16b, v24.16b
|
||||
and v26.16b, v26.16b, v28.16b
|
||||
smax v4.16b, v4.16b, v25.16b
|
||||
and v26.16b, v26.16b, v30.16b
|
||||
uxtl v22.8h, v0.8b
|
||||
uxtl2 v23.8h, v0.16b
|
||||
// Zero delta in lanes that failed the alpha/beta tests.
and v4.16b, v4.16b, v26.16b
|
||||
uxtl v28.8h, v16.8b
|
||||
uxtl2 v29.8h, v16.16b
|
||||
saddw v28.8h, v28.8h, v4.8b
|
||||
saddw2 v29.8h, v29.8h, v4.16b
|
||||
ssubw v22.8h, v22.8h, v4.8b
|
||||
ssubw2 v23.8h, v23.8h, v4.16b
|
||||
sqxtun v16.8b, v28.8h
|
||||
sqxtun v0.8b, v22.8h
|
||||
sqxtun2 v16.16b, v29.8h
|
||||
sqxtun2 v0.16b, v23.8h
|
||||
.endm
|
||||
|
||||
// deblock_v_chroma_neon: filters a horizontal chroma edge, 16 bytes wide.
// x0 = address of the first row below the edge (q0), x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start.
// Loads the rows at x0 - 2*x1 .. x0 + x1 as p1, p0, q0, q1 and writes back
// p0 and q0.
function deblock_v_chroma_neon, export=1
|
||||
// May return early (see the macro).
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, x1, lsl #1
|
||||
ld1 {v18.16b}, [x0], x1
|
||||
ld1 {v16.16b}, [x0], x1
|
||||
ld1 {v0.16b}, [x0], x1
|
||||
ld1 {v2.16b}, [x0]
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
// x0 is at the q1 row; step back two rows to p0.
sub x0, x0, x1, lsl #1
|
||||
st1 {v16.16b}, [x0], x1
|
||||
st1 {v0.16b}, [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_chroma_neon: filters a vertical chroma edge over 8 rows.
// x0 = address just right of the edge in the top row, x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start.
// Loads 8 bytes (x0 - 4 .. x0 + 3) per row, transposes in 16-bit units,
// filters, transposes back and rewrites all 8 bytes of every row.
// NOTE(review): the 16-bit transpose granularity suggests interleaved
// two-byte chroma samples (e.g. U/V pairs) -- confirm.
// The inner label deblock_h_chroma is also entered from
// deblock_h_chroma_422_neon with x0 already offset by -4 and v24.s[0] set.
function deblock_h_chroma_neon, export=1
|
||||
// May return early (see the macro).
h264_loop_filter_start
|
||||
|
||||
sub x0, x0, #4
|
||||
deblock_h_chroma:
|
||||
ld1 {v18.d}[0], [x0], x1
|
||||
ld1 {v16.d}[0], [x0], x1
|
||||
ld1 {v0.d}[0], [x0], x1
|
||||
ld1 {v2.d}[0], [x0], x1
|
||||
ld1 {v18.d}[1], [x0], x1
|
||||
ld1 {v16.d}[1], [x0], x1
|
||||
ld1 {v0.d}[1], [x0], x1
|
||||
ld1 {v2.d}[1], [x0], x1
|
||||
|
||||
transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
|
||||
|
||||
// Rewind the 8 rows consumed by the loads.
sub x0, x0, x1, lsl #3
|
||||
st1 {v18.d}[0], [x0], x1
|
||||
st1 {v16.d}[0], [x0], x1
|
||||
st1 {v0.d}[0], [x0], x1
|
||||
st1 {v2.d}[0], [x0], x1
|
||||
st1 {v18.d}[1], [x0], x1
|
||||
st1 {v16.d}[1], [x0], x1
|
||||
st1 {v0.d}[1], [x0], x1
|
||||
st1 {v2.d}[1], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_chroma_422_neon: vertical chroma edge over 16 rows, done as two
// passes through the deblock_h_chroma body (inside deblock_h_chroma_neon)
// with the stride doubled: first the rows starting at x0, then the rows
// starting at x0 + stride.
// x0 = address just right of the edge in the top row, x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start.
// Uses x5 (second-pass base) and x7 (saved link register) as scratch.
function deblock_h_chroma_422_neon, export=1
|
||||
// x5 = start of the second (odd) row set
add x5, x0, x1
|
||||
sub x0, x0, #4
|
||||
// Double the stride so each pass visits every other row.
add x1, x1, x1
|
||||
// May return early (see the macro); leaves the clip word in w6.
h264_loop_filter_start
|
||||
// bl overwrites x30, so keep the real return address in x7.
mov x7, x30
|
||||
bl deblock_h_chroma
|
||||
mov x30, x7
|
||||
sub x0, x5, #4
|
||||
// The filter macro overwrote v24; reload the clip word for the second pass.
mov v24.s[0], w6
|
||||
// Tail call: the second pass returns directly to our caller.
b deblock_h_chroma
|
||||
endfunc
|
||||
|
||||
// h264_loop_filter_chroma8: 8-lane version of the clip-value based chroma
// edge filter.
// In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1 (low 8 bytes of each),
//      w2 = alpha, w3 = beta,
//      v24.s[0] = four clip bytes (set by h264_loop_filter_start).
// Out: v16 = p0', v17 = q0' (unchanged in lanes that fail the alpha/beta tests).
// Clobbers v4, v22, v24-v26, v28, v30.
.macro h264_loop_filter_chroma8
|
||||
dup v22.8b, w2 // alpha
|
||||
uxtl v24.8h, v24.8b
|
||||
uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
|
||||
uxtl v4.8h, v17.8b
|
||||
uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
|
||||
usubw v4.8h, v4.8h, v16.8b
|
||||
// After this, each of the four clip bytes fills two consecutive lanes of v24.
sli v24.8h, v24.8h, #8
|
||||
shl v4.8h, v4.8h, #2
|
||||
uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
|
||||
uaddw v4.8h, v4.8h, v18.8b
|
||||
cmhi v26.8b, v22.8b, v26.8b // < alpha
|
||||
usubw v4.8h, v4.8h, v19.8b
|
||||
dup v22.8b, w3 // beta
|
||||
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
rshrn v4.8b, v4.8h, #3
|
||||
cmhi v28.8b, v22.8b, v28.8b // < beta
|
||||
cmhi v30.8b, v22.8b, v30.8b // < beta
|
||||
// Clamp delta to [-v24, v24] (smin here, smax below).
smin v4.8b, v4.8b, v24.8b
|
||||
neg v25.8b, v24.8b
|
||||
and v26.8b, v26.8b, v28.8b
|
||||
smax v4.8b, v4.8b, v25.8b
|
||||
and v26.8b, v26.8b, v30.8b
|
||||
uxtl v22.8h, v17.8b
|
||||
// Zero delta in lanes that failed the alpha/beta tests.
and v4.8b, v4.8b, v26.8b
|
||||
uxtl v28.8h, v16.8b
|
||||
saddw v28.8h, v28.8h, v4.8b
|
||||
ssubw v22.8h, v22.8h, v4.8b
|
||||
sqxtun v16.8b, v28.8h
|
||||
sqxtun v17.8b, v22.8h
|
||||
.endm
|
||||
|
||||
// deblock_h_chroma_mbaff_neon: filters a vertical chroma edge over 4 rows.
// x0 = address just right of the edge in the top row, x1 = row stride,
// w2 = alpha, w3 = beta, x4 = pointer read by h264_loop_filter_start
// (x4 is reused as the load pointer once the macro has consumed it).
// Loads 8 bytes (x0 - 4 .. x0 + 3) per row, transposes in 16-bit units and
// stores only the 4 bytes at x0 - 2 of each row (the p0 and q0 halfwords).
function deblock_h_chroma_mbaff_neon, export=1
|
||||
// May return early (see the macro).
h264_loop_filter_start
|
||||
|
||||
sub x4, x0, #4
|
||||
sub x0, x0, #2
|
||||
|
||||
ld1 {v18.8b}, [x4], x1
|
||||
ld1 {v16.8b}, [x4], x1
|
||||
ld1 {v17.8b}, [x4], x1
|
||||
ld1 {v19.8b}, [x4]
|
||||
|
||||
transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31
|
||||
|
||||
h264_loop_filter_chroma8
|
||||
|
||||
// st2 interleaves halfword lane i of v16 (p0') and v17 (q0') for row i.
st2 {v16.h,v17.h}[0], [x0], x1
|
||||
st2 {v16.h,v17.h}[1], [x0], x1
|
||||
st2 {v16.h,v17.h}[2], [x0], x1
|
||||
st2 {v16.h,v17.h}[3], [x0]
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// h264_loop_filter_chroma_intra: intra chroma edge filter.
// In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1,
//      v30 = alpha and v31 = beta in every byte.
// Out: v16 = p0' = (2*p1 + p0 + q1 + 2) >> 2,
//      v17 = q0' = (2*q1 + q0 + p1 + 2) >> 2,
//      written only in lanes where abs(p0-q0) < alpha, abs(p1-p0) < beta and
//      abs(q1-q0) < beta.
// The width argument (default 16) selects whether the upper 8 lanes are
// computed; with any other value only the low 8 lanes get new values.
// Clobbers v4-v7, v20-v28. v30/v31 are only read, so the macro can be
// expanded more than once per function.
.macro h264_loop_filter_chroma_intra width=16
|
||||
uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
|
||||
uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
|
||||
uabd v28.16b, v19.16b, v17.16b // abs(q1 - q0)
|
||||
cmhi v26.16b, v30.16b, v26.16b // < alpha
|
||||
cmhi v27.16b, v31.16b, v27.16b // < beta
|
||||
cmhi v28.16b, v31.16b, v28.16b // < beta
|
||||
and v26.16b, v26.16b, v27.16b
|
||||
and v26.16b, v26.16b, v28.16b
|
||||
|
||||
ushll v4.8h, v18.8b, #1
|
||||
ushll v6.8h, v19.8b, #1
|
||||
.ifc \width, 16
|
||||
ushll2 v5.8h, v18.16b, #1
|
||||
ushll2 v7.8h, v19.16b, #1
|
||||
uaddl2 v21.8h, v16.16b, v19.16b
|
||||
uaddl2 v23.8h, v17.16b, v18.16b
|
||||
.endif
|
||||
uaddl v20.8h, v16.8b, v19.8b
|
||||
uaddl v22.8h, v17.8b, v18.8b
|
||||
add v20.8h, v20.8h, v4.8h // mlal?
|
||||
add v22.8h, v22.8h, v6.8h
|
||||
.ifc \width, 16
|
||||
add v21.8h, v21.8h, v5.8h
|
||||
add v23.8h, v23.8h, v7.8h
|
||||
.endif
|
||||
uqrshrn v24.8b, v20.8h, #2
|
||||
uqrshrn v25.8b, v22.8h, #2
|
||||
.ifc \width, 16
|
||||
uqrshrn2 v24.16b, v21.8h, #2
|
||||
uqrshrn2 v25.16b, v23.8h, #2
|
||||
.endif
|
||||
// Insert the filtered values only where the mask v26 is set.
bit v16.16b, v24.16b, v26.16b
|
||||
bit v17.16b, v25.16b, v26.16b
|
||||
.endm
|
||||
|
||||
// deblock_v_chroma_intra_neon: intra filter for a horizontal chroma edge,
// 16 bytes wide.
// x0 = address of the first row below the edge (q0), x1 = row stride,
// w2 = alpha, w3 = beta.
// Loads the rows at x0 - 2*x1 .. x0 + x1 as p1, p0, q0, q1 and writes back
// p0 and q0.
function deblock_v_chroma_intra_neon, export=1
|
||||
// Returns immediately when alpha and beta are both zero.
h264_loop_filter_start_intra
|
||||
|
||||
sub x0, x0, x1, lsl #1
|
||||
ld1 {v18.16b}, [x0], x1
|
||||
ld1 {v16.16b}, [x0], x1
|
||||
ld1 {v17.16b}, [x0], x1
|
||||
ld1 {v19.16b}, [x0]
|
||||
|
||||
h264_loop_filter_chroma_intra
|
||||
|
||||
// x0 is at the q1 row; step back two rows to p0.
sub x0, x0, x1, lsl #1
|
||||
st1 {v16.16b}, [x0], x1
|
||||
st1 {v17.16b}, [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_chroma_intra_mbaff_neon: intra filter for a vertical chroma edge
// over 4 rows.
// x0 = address just right of the edge in the top row, x1 = row stride,
// w2 = alpha, w3 = beta.
// Loads 8 bytes (x0 - 4 .. x0 + 3) per row, transposes in 16-bit units and
// stores only the 4 bytes at x0 - 2 of each row (the p0 and q0 halfwords).
function deblock_h_chroma_intra_mbaff_neon, export=1
|
||||
// Returns immediately when alpha and beta are both zero.
h264_loop_filter_start_intra
|
||||
|
||||
sub x4, x0, #4
|
||||
sub x0, x0, #2
|
||||
ld1 {v18.8b}, [x4], x1
|
||||
ld1 {v16.8b}, [x4], x1
|
||||
ld1 {v17.8b}, [x4], x1
|
||||
ld1 {v19.8b}, [x4], x1
|
||||
|
||||
transpose4x4.h v18, v16, v17, v19, v26, v27, v28, v29
|
||||
|
||||
// Only the low 8 lanes carry data here, hence width=8.
h264_loop_filter_chroma_intra width=8
|
||||
|
||||
// st2 interleaves halfword lane i of v16 (p0') and v17 (q0') for row i.
st2 {v16.h,v17.h}[0], [x0], x1
|
||||
st2 {v16.h,v17.h}[1], [x0], x1
|
||||
st2 {v16.h,v17.h}[2], [x0], x1
|
||||
st2 {v16.h,v17.h}[3], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_chroma_intra_neon: intra filter for a vertical chroma edge over
// 8 rows.
// x0 = address just right of the edge in the top row, x1 = row stride,
// w2 = alpha, w3 = beta.
// Loads 8 bytes (x0 - 4 .. x0 + 3) per row via x4, transposes in 16-bit
// units and stores only the 4 bytes at x0 - 2 of each row (the p0 and q0
// halfwords).
function deblock_h_chroma_intra_neon, export=1
|
||||
// Returns immediately when alpha and beta are both zero.
h264_loop_filter_start_intra
|
||||
|
||||
sub x4, x0, #4
|
||||
sub x0, x0, #2
|
||||
ld1 {v18.d}[0], [x4], x1
|
||||
ld1 {v16.d}[0], [x4], x1
|
||||
ld1 {v17.d}[0], [x4], x1
|
||||
ld1 {v19.d}[0], [x4], x1
|
||||
ld1 {v18.d}[1], [x4], x1
|
||||
ld1 {v16.d}[1], [x4], x1
|
||||
ld1 {v17.d}[1], [x4], x1
|
||||
ld1 {v19.d}[1], [x4], x1
|
||||
|
||||
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
|
||||
|
||||
h264_loop_filter_chroma_intra
|
||||
|
||||
// st2 interleaves halfword lane i of v16 (p0') and v17 (q0') for row i.
st2 {v16.h,v17.h}[0], [x0], x1
|
||||
st2 {v16.h,v17.h}[1], [x0], x1
|
||||
st2 {v16.h,v17.h}[2], [x0], x1
|
||||
st2 {v16.h,v17.h}[3], [x0], x1
|
||||
st2 {v16.h,v17.h}[4], [x0], x1
|
||||
st2 {v16.h,v17.h}[5], [x0], x1
|
||||
st2 {v16.h,v17.h}[6], [x0], x1
|
||||
st2 {v16.h,v17.h}[7], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta )
// (prototype as declared in deblock.h; x0 = pix, x1 = stride under AAPCS64).
// Runs the same load / transpose / filter / store sequence twice. x4 and x0
// keep advancing by stride, so the second pass handles the next eight rows
// (sixteen rows in total).
function deblock_h_chroma_422_intra_neon, export=1
|
||||
// Macro defined earlier in the file, outside this excerpt.
h264_loop_filter_start_intra
|
||||
|
||||
// x4 = load pointer (pix - 4), x0 = store pointer (pix - 2).
sub x4, x0, #4
|
||||
sub x0, x0, #2
|
||||
// First pass: rows 0-3 into the low halves, rows 4-7 into the high halves.
ld1 {v18.d}[0], [x4], x1
|
||||
ld1 {v16.d}[0], [x4], x1
|
||||
ld1 {v17.d}[0], [x4], x1
|
||||
ld1 {v19.d}[0], [x4], x1
|
||||
ld1 {v18.d}[1], [x4], x1
|
||||
ld1 {v16.d}[1], [x4], x1
|
||||
ld1 {v17.d}[1], [x4], x1
|
||||
ld1 {v19.d}[1], [x4], x1
|
||||
|
||||
// Transpose macro is defined outside this excerpt; v26-v29 are passed as scratch.
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
|
||||
|
||||
// Results are left in v16 and v17.
h264_loop_filter_chroma_intra
|
||||
|
||||
// Each st2 writes halfword lane n of v16 and v17 interleaved (4 bytes) to one row.
st2 {v16.h,v17.h}[0], [x0], x1
|
||||
st2 {v16.h,v17.h}[1], [x0], x1
|
||||
st2 {v16.h,v17.h}[2], [x0], x1
|
||||
st2 {v16.h,v17.h}[3], [x0], x1
|
||||
st2 {v16.h,v17.h}[4], [x0], x1
|
||||
st2 {v16.h,v17.h}[5], [x0], x1
|
||||
st2 {v16.h,v17.h}[6], [x0], x1
|
||||
st2 {v16.h,v17.h}[7], [x0], x1
|
||||
|
||||
// Second pass: x4 already points at row 8, x0 at the matching store row.
ld1 {v18.d}[0], [x4], x1
|
||||
ld1 {v16.d}[0], [x4], x1
|
||||
ld1 {v17.d}[0], [x4], x1
|
||||
ld1 {v19.d}[0], [x4], x1
|
||||
ld1 {v18.d}[1], [x4], x1
|
||||
ld1 {v16.d}[1], [x4], x1
|
||||
ld1 {v17.d}[1], [x4], x1
|
||||
ld1 {v19.d}[1], [x4], x1
|
||||
|
||||
transpose4x8.h v18, v16, v17, v19, v26, v27, v28, v29
|
||||
|
||||
h264_loop_filter_chroma_intra
|
||||
|
||||
st2 {v16.h,v17.h}[0], [x0], x1
|
||||
st2 {v16.h,v17.h}[1], [x0], x1
|
||||
st2 {v16.h,v17.h}[2], [x0], x1
|
||||
st2 {v16.h,v17.h}[3], [x0], x1
|
||||
st2 {v16.h,v17.h}[4], [x0], x1
|
||||
st2 {v16.h,v17.h}[5], [x0], x1
|
||||
st2 {v16.h,v17.h}[6], [x0], x1
|
||||
st2 {v16.h,v17.h}[7], [x0], x1
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// void deblock_strength( uint8_t nnz[X264_SCAN8_SIZE],
|
||||
// int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
// int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
|
||||
// uint8_t bs[2][8][4], int mvy_limit,
|
||||
// int bframe )
|
||||
// Register use, following the AAPCS64 order of the prototype above:
// x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe.
// v4 and v5 accumulate the "ref or mv differs" flags for the two bs halves;
// the nnz test is merged in after the loop and the result is written to bs.
function deblock_strength_neon, export=1
|
||||
movi v4.16b, #0
|
||||
// w4 = (mvy_limit << 8) - 253: low byte 3, high byte mvy_limit - 1
// (for mvy_limit >= 1). Duplicated into v6, these are the per-byte
// thresholds for the saturating subtracts on (|dx|, |dy|) pairs below.
lsl w4, w4, #8
|
||||
// x3 -> bs[1] (bs[0] occupies the first 8*4 = 32 bytes).
add x3, x3, #32
|
||||
sub w4, w4, #(1<<8)-3
|
||||
movi v5.16b, #0
|
||||
dup v6.8h, w4
|
||||
// Post-index for the bs[1] store so that x3 steps back to bs[0].
mov x6, #-32
|
||||
|
||||
// The body below runs once, and a second time when bframe == 1 (see the
// subs/b.eq pair at the bottom). x1 and x2 are not rewound, so the second
// pass reads the data that follows - presumably ref[1] and mv[1].
bframe:
|
||||
// load bytes ref
|
||||
add x2, x2, #16
|
||||
ld1 {v31.d}[1], [x1], #8
|
||||
ld1 {v1.16b}, [x1], #16
|
||||
movi v0.16b, #0
|
||||
ld1 {v2.16b}, [x1], #16
|
||||
// v3/v0 = v1/v2 shifted up by one byte (a zero byte shifted in at the bottom).
ext v3.16b, v0.16b, v1.16b, #15
|
||||
ext v0.16b, v0.16b, v2.16b, #15
|
||||
// unzip is a macro defined outside this excerpt.
unzip v21.4s, v22.4s, v1.4s, v2.4s
|
||||
unzip v23.4s, v20.4s, v3.4s, v0.4s
|
||||
ext v21.16b, v31.16b, v22.16b, #12
|
||||
|
||||
// Any differing ref byte sets bits in the accumulators.
eor v0.16b, v20.16b, v22.16b
|
||||
eor v1.16b, v21.16b, v22.16b
|
||||
orr v4.16b, v4.16b, v0.16b
|
||||
orr v5.16b, v5.16b, v1.16b
|
||||
|
||||
ld1 {v21.8h}, [x2], #16 // mv + 0x10
|
||||
ld1 {v19.8h}, [x2], #16 // mv + 0x20
|
||||
ld1 {v22.8h}, [x2], #16 // mv + 0x30
|
||||
ld1 {v18.8h}, [x2], #16 // mv + 0x40
|
||||
ld1 {v23.8h}, [x2], #16 // mv + 0x50
|
||||
// v19/v18 become the v22/v23 rows shifted by one mv (4 bytes), so the sabd
// below compares each mv with its neighbour inside the same row.
ext v19.16b, v19.16b, v22.16b, #12
|
||||
ext v18.16b, v18.16b, v23.16b, #12
|
||||
sabd v0.8h, v22.8h, v19.8h
|
||||
ld1 {v19.8h}, [x2], #16 // mv + 0x60
|
||||
sabd v1.8h, v23.8h, v18.8h
|
||||
ld1 {v24.8h}, [x2], #16 // mv + 0x70
|
||||
// Saturating narrow: absolute differences are clamped to 0..255 bytes.
uqxtn v0.8b, v0.8h
|
||||
ld1 {v18.8h}, [x2], #16 // mv + 0x80
|
||||
ld1 {v25.8h}, [x2], #16 // mv + 0x90
|
||||
uqxtn2 v0.16b, v1.8h
|
||||
ext v19.16b, v19.16b, v24.16b, #12
|
||||
ext v18.16b, v18.16b, v25.16b, #12
|
||||
sabd v1.8h, v24.8h, v19.8h
|
||||
sabd v2.8h, v25.8h, v18.8h
|
||||
uqxtn v1.8b, v1.8h
|
||||
uqxtn2 v1.16b, v2.8h
|
||||
|
||||
// Non-zero byte remains only where |dx| >= 4 or |dy| >= mvy_limit ...
uqsub v0.16b, v0.16b, v6.16b
|
||||
uqsub v1.16b, v1.16b, v6.16b
|
||||
// ... and the second narrow folds each (x, y) byte pair into one flag byte.
uqxtn v0.8b, v0.8h
|
||||
uqxtn2 v0.16b, v1.8h
|
||||
|
||||
sabd v1.8h, v22.8h, v23.8h
|
||||
orr v4.16b, v4.16b, v0.16b
|
||||
|
||||
// Same test between consecutive mv rows (v21..v25), accumulated into v5.
sabd v0.8h, v21.8h, v22.8h
|
||||
sabd v2.8h, v23.8h, v24.8h
|
||||
sabd v3.8h, v24.8h, v25.8h
|
||||
uqxtn v0.8b, v0.8h
|
||||
uqxtn2 v0.16b, v1.8h
|
||||
uqxtn v1.8b, v2.8h
|
||||
uqxtn2 v1.16b, v3.8h
|
||||
|
||||
uqsub v0.16b, v0.16b, v6.16b
|
||||
uqsub v1.16b, v1.16b, v6.16b
|
||||
uqxtn v0.8b, v0.8h
|
||||
uqxtn2 v0.16b, v1.8h
|
||||
// Decrement bframe; the branch below is taken only when it was exactly 1.
subs w5, w5, #1
|
||||
orr v5.16b, v5.16b, v0.16b
|
||||
b.eq bframe
|
||||
|
||||
// v6 is reused from here on as the constant 1 in every byte.
movi v6.16b, #1
|
||||
// load bytes nnz
|
||||
ld1 {v31.d}[1], [x0], #8
|
||||
ld1 {v1.16b}, [x0], #16
|
||||
movi v0.16b, #0
|
||||
ld1 {v2.16b}, [x0], #16
|
||||
ext v3.16b, v0.16b, v1.16b, #15
|
||||
ext v0.16b, v0.16b, v2.16b, #15
|
||||
unzip v21.4s, v22.4s, v1.4s, v2.4s
|
||||
unzip v23.4s, v20.4s, v3.4s, v0.4s
|
||||
ext v21.16b, v31.16b, v22.16b, #12
|
||||
|
||||
// v7 = byte permutation from transpose_table, applied to v4 by the tbl below.
movrel x7, transpose_table
|
||||
ld1 {v7.16b}, [x7]
|
||||
// A byte is non-zero when either of the two neighbouring nnz values is non-zero.
orr v0.16b, v20.16b, v22.16b
|
||||
orr v1.16b, v21.16b, v22.16b
|
||||
umin v0.16b, v0.16b, v6.16b
|
||||
umin v1.16b, v1.16b, v6.16b
|
||||
umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0
|
||||
umin v5.16b, v5.16b, v6.16b
|
||||
add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0
|
||||
add v1.16b, v1.16b, v1.16b
|
||||
// Final strength per byte = max( nnz ? 2 : 0, mv/ref ? 1 : 0 ).
umax v4.16b, v4.16b, v0.16b
|
||||
umax v5.16b, v5.16b, v1.16b
|
||||
tbl v6.16b, {v4.16b}, v7.16b
|
||||
st1 {v5.16b}, [x3], x6 // bs[1]
|
||||
st1 {v6.16b}, [x3] // bs[0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 16 byte indices loaded into v7 by deblock_strength_neon and used as the
// tbl selector for v4: output byte 4*r + c takes input byte 4*c + r, i.e. a
// 4x4 byte transpose.
const transpose_table
|
||||
.byte 0, 4, 8, 12
|
||||
.byte 1, 5, 9, 13
|
||||
.byte 2, 6, 10, 14
|
||||
.byte 3, 7, 11, 15
|
||||
endconst
|
||||
61
common/aarch64/deblock.h
Normal file
61
common/aarch64/deblock.h
Normal file
@@ -0,0 +1,61 @@
|
||||
/*****************************************************************************
|
||||
* deblock.h: aarch64 deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
/* Prototypes for the AArch64 assembly deblocking routines. Every public name
 * is first routed through x264_template() (defined elsewhere in the project)
 * and then declared. */
#ifndef X264_AARCH64_DEBLOCK_H
|
||||
#define X264_AARCH64_DEBLOCK_H
|
||||
|
||||
/* NEON edge filters taking a tc0 table in addition to alpha/beta. */
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
|
||||
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
|
||||
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
|
||||
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
|
||||
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
/* Computes the bs (boundary strength) array from nnz, ref and mv. */
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
|
||||
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
|
||||
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
|
||||
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
/* Intra variants: same pix/stride/alpha/beta arguments, no tc0 table. */
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
|
||||
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
|
||||
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
|
||||
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
|
||||
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
|
||||
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
|
||||
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
|
||||
/* SVE implementation; same signature as x264_deblock_v_chroma_neon. */
#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
|
||||
void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
|
||||
#endif
|
||||
66
common/aarch64/mc-a-common.S
Normal file
66
common/aarch64/mc-a-common.S
Normal file
@@ -0,0 +1,66 @@
|
||||
/****************************************************************************
|
||||
* mc-a-common.S: aarch64 motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
* Mans Rullgard <mans@mansr.com>
|
||||
* Stefan Groenroos <stefan.gronroos@gmail.com>
|
||||
* David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
// This file contains the NEON macros and functions that are intended to be used by
|
||||
// the SVE/SVE2 functions as well
|
||||
|
||||
#if BIT_DEPTH == 8
|
||||
|
||||
// 0 < weight < 64
|
||||
// Both factors (w6 and w7) are already usable as they are. The only effect
// of writing w6 to itself is that the upper 32 bits of x6 are zeroed.
.macro load_weights_add_add
|
||||
mov w6, w6
|
||||
.endm
|
||||
|
||||
// weight > 64
|
||||
// Flips the sign of w7 so that the matching weight_add_sub_* macro can
// subtract the second term. (In mc-a-sve.S the caller sets w7 = 64 - weight,
// which is negative on this path.)
.macro load_weights_add_sub
|
||||
neg w7, w7
|
||||
.endm
|
||||
|
||||
// weight < 0
|
||||
// Flips the sign of w6 (the weight itself) so that the matching
// weight_sub_add_* macro can subtract the first term.
.macro load_weights_sub_add
|
||||
neg w6, w6
|
||||
.endm
|
||||
|
||||
// Unweighted 4-pixel-wide average, two rows per loop iteration.
// Not exported and entered by a branch: it expects w9 = row count (the
// AVGH_SVE stub in mc-a-sve.S sets it before branching here), x0/x1 =
// dst/stride, x2/x3 = src1/stride and x4/x5 = src2/stride.
function pixel_avg_w4_neon
|
||||
// Flags from this subs are consumed by the b.gt at the bottom of the loop.
1: subs w9, w9, #2
|
||||
ld1 {v0.s}[0], [x2], x3
|
||||
ld1 {v2.s}[0], [x4], x5
|
||||
// urhadd = unsigned rounding halving add, i.e. (a + b + 1) >> 1 per byte.
urhadd v0.8b, v0.8b, v2.8b
|
||||
ld1 {v1.s}[0], [x2], x3
|
||||
ld1 {v3.s}[0], [x4], x5
|
||||
urhadd v1.8b, v1.8b, v3.8b
|
||||
// Only the low 4 bytes of each result are stored.
st1 {v0.s}[0], [x0], x1
|
||||
st1 {v1.s}[0], [x0], x1
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
#else // BIT_DEPTH == 10
|
||||
|
||||
#endif
|
||||
108
common/aarch64/mc-a-sve.S
Normal file
108
common/aarch64/mc-a-sve.S
Normal file
@@ -0,0 +1,108 @@
|
||||
/*****************************************************************************
|
||||
* mc-a-sve.S: aarch64 motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "mc-a-common.S"
|
||||
|
||||
ENABLE_SVE
|
||||
|
||||
#if BIT_DEPTH == 8
|
||||
|
||||
// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
|
||||
// uint8_t *src1, intptr_t src1_stride,
|
||||
// uint8_t *src2, intptr_t src2_stride, int weight );
|
||||
// Emits the exported entry point pixel_avg_<w>x<h>_sve. The stub only
// dispatches on weight (w6, the seventh argument) and leaves w9 = h and
// w7 = 64 - weight for the worker it branches to.
.macro AVGH_SVE w h
|
||||
function pixel_avg_\w\()x\h\()_sve, export=1
|
||||
mov w10, #64
|
||||
cmp w6, #32
|
||||
// mov does not touch the flags, so b.eq still tests weight == 32.
mov w9, #\h
|
||||
// weight == 32: plain rounding average in the shared NEON routine.
b.eq pixel_avg_w\w\()_neon
|
||||
subs w7, w10, w6
|
||||
b.lt pixel_avg_weight_w\w\()_add_sub_sve // weight > 64
|
||||
cmp w6, #0
|
||||
b.ge pixel_avg_weight_w\w\()_add_add_sve
|
||||
b pixel_avg_weight_w\w\()_sub_add_sve // weight < 0
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// Only width-4 entry points are instantiated here.
AVGH_SVE 4, 2
|
||||
AVGH_SVE 4, 4
|
||||
AVGH_SVE 4, 8
|
||||
AVGH_SVE 4, 16
|
||||
|
||||
// 0 < weight < 64
|
||||
// dst = s1 * v30 + s2 * v31 per 16-bit lane. v30/v31 hold the two factors
// broadcast by AVG_WEIGHT_SVE; the optional h argument is not referenced.
.macro weight_add_add_sve dst, s1, s2, h=
|
||||
mul \dst, \s1, v30.8h
|
||||
mla \dst, \s2, v31.8h
|
||||
.endm
|
||||
|
||||
// weight > 64
|
||||
// dst = s1 * v30 - s2 * v31 per 16-bit lane (v31 carries the factor negated
// by load_weights_add_sub); the optional h argument is not referenced.
.macro weight_add_sub_sve dst, s1, s2, h=
|
||||
mul \dst, \s1, v30.8h
|
||||
mls \dst, \s2, v31.8h
|
||||
.endm
|
||||
|
||||
// weight < 0
|
||||
// dst = s2 * v31 - s1 * v30 per 16-bit lane (v30 carries the factor negated
// by load_weights_sub_add); the optional h argument is not referenced.
.macro weight_sub_add_sve dst, s1, s2, h=
|
||||
mul \dst, \s2, v31.8h
|
||||
mls \dst, \s1, v30.8h
|
||||
.endm
|
||||
|
||||
// Emits pixel_avg_weight_w4_<ext>_sve, the weighted 4-pixel-wide worker that
// the AVGH_SVE stubs branch to. Expects w6/w7 = the two weight factors,
// w9 = row count, x0/x1 = dst/stride, x2/x3 = src1/stride, x4/x5 = src2/stride.
// Two rows are produced per loop iteration.
.macro AVG_WEIGHT_SVE ext
|
||||
function pixel_avg_weight_w4_\ext\()_sve
|
||||
// Sign fix-up of w6/w7 for this variant (macros from mc-a-common.S).
load_weights_\ext
|
||||
// NOTE(review): the predicate is built for byte elements (first 8 set) but
// governs .h loads below, which should leave four halfword elements active -
// matching the 4-pixel width. Confirm against the SVE predicate layout.
ptrue p0.b, vl8
|
||||
dup v30.8h, w6
|
||||
dup v31.8h, w7
|
||||
1: // height loop
|
||||
// Flags set here are consumed by the b.gt at the end of the loop; nothing
// in between writes the condition flags.
subs w9, w9, #2
|
||||
// ld1b into .h elements zero-extends each loaded byte to 16 bits. The NEON
// code below reads the same data through v0-v3, the low 128 bits of z0-z3.
ld1b {z0.h}, p0/z, [x2]
|
||||
add x2, x2, x3
|
||||
ld1b {z1.h}, p0/z, [x4]
|
||||
add x4, x4, x5
|
||||
weight_\ext\()_sve v4.8h, v0.8h, v1.8h
|
||||
ld1b {z2.h}, p0/z, [x2]
|
||||
add x2, x2, x3
|
||||
ld1b {z3.h}, p0/z, [x4]
|
||||
add x4, x4, x5
|
||||
|
||||
// Rounding shift right by 6 with unsigned saturation back to 8 bits.
sqrshrun v0.8b, v4.8h, #6
|
||||
weight_\ext\()_sve v5.8h, v2.8h, v3.8h
|
||||
// Only the low 4 bytes of each result row are stored.
st1 {v0.s}[0], [x0], x1
|
||||
sqrshrun v1.8b, v5.8h, #6
|
||||
st1 {v1.s}[0], [x0], x1
|
||||
b.gt 1b
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// One worker per weight range dispatched by AVGH_SVE.
AVG_WEIGHT_SVE add_add
|
||||
AVG_WEIGHT_SVE add_sub
|
||||
AVG_WEIGHT_SVE sub_add
|
||||
|
||||
#else // BIT_DEPTH == 10
|
||||
|
||||
|
||||
#endif
|
||||
3935
common/aarch64/mc-a.S
Normal file
3935
common/aarch64/mc-a.S
Normal file
File diff suppressed because it is too large
Load Diff
371
common/aarch64/mc-c.c
Normal file
371
common/aarch64/mc-c.c
Normal file
@@ -0,0 +1,371 @@
|
||||
/*****************************************************************************
|
||||
* mc-c.c: aarch64 motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "mc.h"
|
||||
|
||||
#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
|
||||
void x264_prefetch_ref_aarch64( pixel *, intptr_t, int );
|
||||
#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
|
||||
void x264_prefetch_fenc_420_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
|
||||
void x264_prefetch_fenc_422_aarch64( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
|
||||
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
|
||||
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
|
||||
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
|
||||
void x264_memzero_aligned_neon( void *dst, size_t n );
|
||||
|
||||
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
|
||||
void x264_pixel_avg_16x16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
|
||||
void x264_pixel_avg_16x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
|
||||
void x264_pixel_avg_8x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
|
||||
void x264_pixel_avg_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
|
||||
void x264_pixel_avg_8x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
|
||||
void x264_pixel_avg_4x16_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
|
||||
void x264_pixel_avg_4x8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
|
||||
void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
|
||||
void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
|
||||
#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
|
||||
void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
|
||||
void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
|
||||
void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
|
||||
void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
|
||||
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
|
||||
void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
|
||||
void x264_pixel_avg2_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
|
||||
void x264_pixel_avg2_w16_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
|
||||
void x264_pixel_avg2_w20_neon( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
|
||||
|
||||
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
|
||||
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
|
||||
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
|
||||
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
|
||||
pixel *dstv, intptr_t i_dstv,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
|
||||
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
|
||||
pixel *dstb, intptr_t i_dstb,
|
||||
pixel *dstc, intptr_t i_dstc,
|
||||
pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
|
||||
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
|
||||
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
|
||||
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
|
||||
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
|
||||
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||
|
||||
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
|
||||
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
|
||||
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
|
||||
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
|
||||
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
|
||||
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
|
||||
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
|
||||
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
|
||||
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
|
||||
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
|
||||
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
|
||||
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
|
||||
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
|
||||
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
|
||||
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
|
||||
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
|
||||
#define MC_WEIGHT(func)\
|
||||
void x264_mc_weight_w20##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||
void x264_mc_weight_w16##func##_neon( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||
void x264_mc_weight_w8##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||
void x264_mc_weight_w4##func##_neon ( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int );\
|
||||
\
|
||||
static void (* mc##func##_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, const x264_weight_t *, int ) =\
|
||||
{\
|
||||
x264_mc_weight_w4##func##_neon,\
|
||||
x264_mc_weight_w4##func##_neon,\
|
||||
x264_mc_weight_w8##func##_neon,\
|
||||
x264_mc_weight_w16##func##_neon,\
|
||||
x264_mc_weight_w16##func##_neon,\
|
||||
x264_mc_weight_w20##func##_neon,\
|
||||
};
|
||||
|
||||
MC_WEIGHT()
|
||||
MC_WEIGHT(_nodenom)
|
||||
MC_WEIGHT(_offsetadd)
|
||||
MC_WEIGHT(_offsetsub)
|
||||
|
||||
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
|
||||
void x264_mc_copy_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
|
||||
void x264_mc_copy_w8_neon ( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
|
||||
void x264_mc_copy_w16_neon( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
|
||||
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
|
||||
void x264_mc_chroma_neon( pixel *, pixel *, intptr_t, pixel *, intptr_t, int, int, int, int );
|
||||
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
|
||||
void x264_integral_init4h_neon( uint16_t *, pixel *, intptr_t );
|
||||
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
|
||||
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
|
||||
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
|
||||
void x264_integral_init8h_neon( uint16_t *, pixel *, intptr_t );
|
||||
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
|
||||
void x264_integral_init8v_neon( uint16_t *, intptr_t );
|
||||
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
|
||||
void x264_frame_init_lowres_core_neon( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, intptr_t, int, int );
|
||||
|
||||
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
|
||||
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
|
||||
|
||||
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
|
||||
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
|
||||
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
|
||||
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
|
||||
|
||||
/* Two-source averaging kernels, indexed by i_width>>2 (see mc_luma_neon and
 * get_ref_neon). Slot 0 is intentionally empty. */
static void (* const pixel_avg_wtab_neon[6])( pixel *, intptr_t, pixel *, intptr_t, pixel *, int ) =
{
    [0] = NULL,
    [1] = x264_pixel_avg2_w4_neon,
    [2] = x264_pixel_avg2_w8_neon,
    [3] = x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
    [4] = x264_pixel_avg2_w16_neon,
    [5] = x264_pixel_avg2_w20_neon,
};
|
||||
|
||||
/* Plain copy kernels, indexed by i_width>>2 (see mc_luma_neon). Only the
 * slots for widths 4, 8 and 16 are populated; slots 0 and 3 are NULL. */
static void (* const mc_copy_wtab_neon[5])( pixel *, intptr_t, pixel *, intptr_t, int ) =
{
    [0] = NULL,
    [1] = x264_mc_copy_w4_neon,
    [2] = x264_mc_copy_w8_neon,
    [3] = NULL,
    [4] = x264_mc_copy_w16_neon,
};
|
||||
|
||||
/* Selects the NEON weight kernel table for w and stores it in w->weightfn.
 *
 *  - i_scale == 1<<i_denom: offset-only tables; the magnitude of i_offset
 *    is cached in w->cachea[0] and its sign picks the add or sub table.
 *  - otherwise, i_denom == 0: the _nodenom table.
 *  - otherwise: the general table.
 *
 * h is not used. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale != 1<<w->i_denom )
    {
        if( w->i_denom )
            w->weightfn = mc_wtab_neon;
        else
            w->weightfn = mc_nodenom_wtab_neon;
        return;
    }

    /* Offset-only weighting. */
    if( w->i_offset >= 0 )
    {
        w->weightfn = mc_offsetadd_wtab_neon;
        w->cachea[0] = w->i_offset;
    }
    else
    {
        w->weightfn = mc_offsetsub_wtab_neon;
        w->cachea[0] = -w->i_offset;
    }
}
|
||||
|
||||
/* Writes an i_width x i_height prediction block for the quarter-pel motion
 * vector (mvx, mvy) into dst.
 *
 * The two low bits of each vector component select, through x264_hpel_ref0 /
 * x264_hpel_ref1, which of the four planes in src are read; the remaining
 * bits give the integer offset into those planes. When either component has
 * its lowest bit set, two planes are averaged; otherwise a single plane is
 * copied. If weight->weightfn is set, the weight kernel is applied (in place
 * after averaging, or instead of the copy). All kernel tables are indexed by
 * i_width>>2. */
static void mc_luma_neon( pixel *dst, intptr_t i_dst_stride,
                          pixel *src[4], intptr_t i_src_stride,
                          int mvx, int mvy,
                          int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int wtab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        ref_a += i_src_stride;

    if( !(qpel_idx & 5) )
    {
        /* Both fractions are even: one plane is enough. */
        if( weight->weightfn )
            weight->weightfn[wtab_idx]( dst, i_dst_stride, ref_a, i_src_stride, weight, i_height );
        else
            mc_copy_wtab_neon[wtab_idx]( dst, i_dst_stride, ref_a, i_src_stride, i_height );
        return;
    }

    /* qpel interpolation needed: average the two selected planes. */
    pixel *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
    pixel_avg_wtab_neon[wtab_idx]( dst, i_dst_stride, ref_a, i_src_stride,
                                   ref_b, i_height );
    if( weight->weightfn )
        weight->weightfn[wtab_idx]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
}
|
||||
|
||||
/* Returns a pointer to the i_width x i_height prediction block for the
 * quarter-pel motion vector (mvx, mvy).
 *
 * When averaging or weighting is required, the block is rendered into dst
 * (using the stride in *i_dst_stride) and dst is returned. Otherwise nothing
 * is copied: the function returns a pointer straight into the selected source
 * plane and overwrites *i_dst_stride with i_src_stride so the caller walks it
 * with the right pitch. Kernel tables are indexed by i_width>>2. */
static pixel *get_ref_neon( pixel *dst, intptr_t *i_dst_stride,
                            pixel *src[4], intptr_t i_src_stride,
                            int mvx, int mvy,
                            int i_width, int i_height, const x264_weight_t *weight )
{
    const int frac_x = mvx&3;
    const int frac_y = mvy&3;
    const int qpel_idx = (frac_y<<2) + frac_x;
    const int wtab_idx = i_width>>2;
    const intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);

    pixel *ref_a = src[x264_hpel_ref0[qpel_idx]] + offset;
    if( frac_y == 3 ) // explicit if() to force conditional add
        ref_a += i_src_stride;

    if( qpel_idx & 5 )
    {
        /* qpel interpolation needed: average the two selected planes. */
        pixel *ref_b = src[x264_hpel_ref1[qpel_idx]] + offset + (frac_x == 3);
        pixel_avg_wtab_neon[wtab_idx]( dst, *i_dst_stride, ref_a, i_src_stride,
                                       ref_b, i_height );
        if( weight->weightfn )
            weight->weightfn[wtab_idx]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }

    if( weight->weightfn )
    {
        weight->weightfn[wtab_idx]( dst, *i_dst_stride, ref_a, i_src_stride, weight, i_height );
        return dst;
    }

    /* Nothing to compute: hand back the source plane itself. */
    *i_dst_stride = i_src_stride;
    return ref_a;
}
|
||||
|
||||
#define x264_hpel_filter_neon x264_template(hpel_filter_neon)
|
||||
void x264_hpel_filter_neon( pixel *dsth, pixel *dstv, pixel *dstc,
|
||||
pixel *src, intptr_t stride, int width,
|
||||
int height, int16_t *buf );
|
||||
|
||||
|
||||
#if !HIGH_BIT_DEPTH && HAVE_I8MM
|
||||
#define x264_hpel_filter_neon_i8mm x264_template(hpel_filter_neon_i8mm)
|
||||
void x264_hpel_filter_neon_i8mm( pixel *dsth, pixel *dstv, pixel *dstc,
|
||||
pixel *src, intptr_t stride, int width,
|
||||
int height, int16_t *buf );
|
||||
#endif // !HIGH_BIT_DEPTH && HAVE_I8MM
|
||||
|
||||
PLANE_COPY(16, neon)
|
||||
PLANE_COPY_SWAP(16, neon)
|
||||
PLANE_INTERLEAVE(neon)
|
||||
PROPAGATE_LIST(neon)
|
||||
|
||||
/* Install the AArch64 motion-compensation routines into *pf according to the
 * runtime CPU capability bits in cpu. The groups are applied in order
 * (ARMv8, NEON, then SVE and I8MM when HIGH_BIT_DEPTH is 0), so a later group
 * overrides entries written by an earlier one. */
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu & X264_CPU_ARMV8 )
    {
        pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
        pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
        pf->prefetch_ref      = x264_prefetch_ref_aarch64;
    }

    if( cpu & X264_CPU_NEON )
    {
        /* Luma / chroma motion compensation and half-pel filtering. */
        pf->mc_luma     = mc_luma_neon;
        pf->get_ref     = get_ref_neon;
        pf->mc_chroma   = x264_mc_chroma_neon;
        pf->hpel_filter = x264_hpel_filter_neon;

        /* Bi-directional averaging. */
        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

        /* Block copies. */
        pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;

        /* Weighted prediction. */
        pf->weight       = mc_wtab_neon;
        pf->offsetadd    = mc_offsetadd_wtab_neon;
        pf->offsetsub    = mc_offsetsub_wtab_neon;
        pf->weight_cache = weight_cache_neon;

        /* Plane copies and chroma (de)interleaving. */
        pf->plane_copy                  = plane_copy_neon;
        pf->plane_copy_swap             = plane_copy_swap_neon;
        pf->plane_copy_interleave       = plane_copy_interleave_neon;
        pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;

        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
        pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;

        /* Integral images and lowres frame generation. */
        pf->integral_init4h = x264_integral_init4h_neon;
        pf->integral_init4v = x264_integral_init4v_neon;
        pf->integral_init8h = x264_integral_init8h_neon;
        pf->integral_init8v = x264_integral_init8v_neon;

        pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

        /* Aligned memory helpers. */
        pf->memcpy_aligned  = x264_memcpy_aligned_neon;
        pf->memzero_aligned = x264_memzero_aligned_neon;

        /* MB-tree helpers. */
        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
        pf->mbtree_propagate_list = mbtree_propagate_list_neon;
        pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
        pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
    }

#if !HIGH_BIT_DEPTH && HAVE_SVE
    if( cpu & X264_CPU_SVE )
    {
        /* Replaces the NEON entries for the 4-pixel-wide averages. */
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve;
        pf->avg[PIXEL_4x8]  = x264_pixel_avg_4x8_sve;
        pf->avg[PIXEL_4x4]  = x264_pixel_avg_4x4_sve;
        pf->avg[PIXEL_4x2]  = x264_pixel_avg_4x2_sve;
    }
#endif // !HIGH_BIT_DEPTH && HAVE_SVE

#if !HIGH_BIT_DEPTH && HAVE_I8MM
    if( cpu & X264_CPU_I8MM )
    {
        /* Replaces the NEON half-pel filter. */
        pf->hpel_filter = x264_hpel_filter_neon_i8mm;
    }
#endif // !HIGH_BIT_DEPTH && HAVE_I8MM
}
|
||||
32
common/aarch64/mc.h
Normal file
32
common/aarch64/mc.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*****************************************************************************
|
||||
* mc.h: aarch64 motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2014-2025 x264 project
|
||||
*
|
||||
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_MC_H
|
||||
#define X264_AARCH64_MC_H
|
||||
|
||||
#define x264_mc_init_aarch64 x264_template(mc_init_aarch64)
|
||||
void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf );
|
||||
|
||||
#endif
|
||||
44
common/aarch64/pixel-a-common.S
Normal file
44
common/aarch64/pixel-a-common.S
Normal file
@@ -0,0 +1,44 @@
|
||||
/****************************************************************************
|
||||
* pixel-a-common.S: aarch64 pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
* David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
// This file contains the NEON macros and constants that are intended to be used by
|
||||
// the SVE/SVE2 functions as well
|
||||
|
||||
// Two rows of eight 16-bit lane masks for use with a bitwise AND
// (0 clears a lane, -1 keeps it). The first row clears lanes 0 and 4,
// the second row clears lane 0 only.
const mask_ac_4_8
|
||||
.short 0, -1, -1, -1, 0, -1, -1, -1
|
||||
.short 0, -1, -1, -1, -1, -1, -1, -1
|
||||
endconst
|
||||
|
||||
// Expand SUMSUB_AB twice: once on the pair (a, b) with outputs s1/d1 and once
// on the pair (c, d) with outputs s2/d2. SUMSUB_AB is defined outside this
// file.
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
|
||||
SUMSUB_AB \s1, \d1, \a, \b
|
||||
SUMSUB_AB \s2, \d2, \c, \d
|
||||
.endm
|
||||
|
||||
// Two chained SUMSUB_ABCD stages over four vectors. r1..r4 are both the
// inputs and the outputs; t1..t4 are overwritten as temporaries.
// Stage 1 writes t1/t2 from (r1, r2) and t3/t4 from (r3, r4).
// Stage 2 writes r1/r3 from (t1, t3) and r2/r4 from (t2, t4).
.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
|
||||
SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
|
||||
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
|
||||
.endm
|
||||
523
common/aarch64/pixel-a-sve.S
Normal file
523
common/aarch64/pixel-a-sve.S
Normal file
@@ -0,0 +1,523 @@
|
||||
/*****************************************************************************
|
||||
* pixel-a-sve.S: aarch64 pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Chen <david.chen@myais.com.cn>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
#include "pixel-a-common.S"
|
||||
|
||||
ENABLE_SVE
|
||||
|
||||
#if BIT_DEPTH == 8
|
||||
|
||||
// First step of a 4-pixel-wide SSD (8-bit pixels).
// Sets p0 to the first four halfword lanes, loads one 4-byte row from [x0]
// and [x2] widened to halfwords (z16/z17), and steps x0 by x1 and x2 by x3.
// The row difference goes to v2, the next row is preloaded into z16/z17, and
// v0.4s is initialised with the squared differences of the first row.
.macro SSD_START_SVE_4
|
||||
ptrue p0.h, vl4
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
ld1b {z17.h}, p0/z, [x2]
|
||||
add x0, x0, x1
|
||||
add x2, x2, x3
|
||||
sub v2.4h, v16.4h, v17.4h
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
ld1b {z17.h}, p0/z, [x2]
|
||||
add x0, x0, x1
|
||||
add x2, x2, x3
|
||||
smull v0.4s, v2.4h, v2.4h
|
||||
.endm
|
||||
|
||||
// Middle step of a 4-pixel-wide SSD (8-bit pixels).
// Takes the difference of the rows already held in v16/v17, preloads the next
// row from [x0]/[x2] (stepping by x1/x3), and accumulates the squared
// differences into v0.4s. Expects p0 to have been set up beforehand.
.macro SSD_SVE_4
|
||||
sub v2.4h, v16.4h, v17.4h
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
ld1b {z17.h}, p0/z, [x2]
|
||||
add x0, x0, x1
|
||||
add x2, x2, x3
|
||||
smlal v0.4s, v2.4h, v2.4h
|
||||
.endm
|
||||
|
||||
// Final step of a 4-pixel-wide SSD (8-bit pixels): squares the difference of
// the rows already held in v16/v17 and accumulates into v0.4s. Loads nothing.
.macro SSD_END_SVE_4
|
||||
sub v2.4h, v16.4h, v17.4h
|
||||
smlal v0.4s, v2.4h, v2.4h
|
||||
.endm
|
||||
|
||||
// First step of an 8-pixel-wide SSD (8-bit pixels).
// Sets p0 to the first eight halfword lanes, loads one 8-byte row from [x0]
// and [x2] widened to halfwords, and computes their difference in v2.8h.
// v0.4s is initialised from the low four lanes (smull) and the high four
// lanes are added with smlal2. The next row is preloaded into z16/z17 and
// x0/x2 are advanced by x1/x3.
.macro SSD_START_SVE_8
|
||||
ptrue p0.h, vl8
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
ld1b {z17.h}, p0/z, [x2]
|
||||
add x0, x0, x1
|
||||
add x2, x2, x3
|
||||
sub v2.8h, v16.8h, v17.8h
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
smull v0.4s, v2.4h, v2.4h
|
||||
ld1b {z17.h}, p0/z, [x2]
|
||||
smlal2 v0.4s, v2.8h, v2.8h
|
||||
add x0, x0, x1
|
||||
add x2, x2, x3
|
||||
.endm
|
||||
|
||||
// Middle step of an 8-pixel-wide SSD (8-bit pixels).
// Takes the difference of the rows already held in v16/v17, preloads the next
// row from [x0]/[x2], and accumulates the squared differences of both vector
// halves into v0.4s. Expects p0 to have been set up beforehand.
.macro SSD_SVE_8
|
||||
sub v2.8h, v16.8h, v17.8h
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
smlal v0.4s, v2.4h, v2.4h
|
||||
ld1b {z17.h}, p0/z, [x2]
|
||||
smlal2 v0.4s, v2.8h, v2.8h
|
||||
add x0, x0, x1
|
||||
add x2, x2, x3
|
||||
.endm
|
||||
|
||||
// Final step of an 8-pixel-wide SSD (8-bit pixels): squares the difference of
// the rows already held in v16/v17 and accumulates both halves into v0.4s.
// Loads nothing.
.macro SSD_END_SVE_8
|
||||
sub v2.8h, v16.8h, v17.8h
|
||||
smlal v0.4s, v2.4h, v2.4h
|
||||
smlal2 v0.4s, v2.8h, v2.8h
|
||||
.endm
|
||||
|
||||
// Emit the exported function pixel_ssd_<w>x<h>_sve (8-bit pixels).
// SSD_START_SVE_<w> handles the first row and preloads the second, the
// .rept block runs the middle step h-2 times, and SSD_END_SVE_<w> handles the
// last preloaded row, so h rows are consumed in total. The four 32-bit
// partial sums in v0 are then added together and returned in w0.
.macro SSD_FUNC_SVE w h
|
||||
function pixel_ssd_\w\()x\h\()_sve, export=1
|
||||
SSD_START_SVE_\w
|
||||
.rept \h-2
|
||||
SSD_SVE_\w
|
||||
.endr
|
||||
SSD_END_SVE_\w
|
||||
|
||||
addv s0, v0.4s
|
||||
mov w0, v0.s[0]
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// Load eight rows from [x0] (step x1) and [x2] (step x3), widening bytes to
// halfwords under predicate p0, and leave the eight row differences
// ([x0] row minus [x2] row) in v16..v23.
// It also runs the first SUMSUB_AB stage: v0/v1 are produced from
// (v16, v17) and v2/v3 from (v18, v19).
// p0 is not set here; it must be initialised before the macro is expanded.
// Registers v0..v7 are used as load temporaries.
.macro load_diff_fly_sve_8x8
|
||||
ld1b {z1.h}, p0/z, [x2]
|
||||
ld1b {z0.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
ld1b {z3.h}, p0/z, [x2]
|
||||
ld1b {z2.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
sub v16.8h, v0.8h, v1.8h
|
||||
sub v17.8h, v2.8h, v3.8h
|
||||
ld1b {z5.h}, p0/z, [x2]
|
||||
ld1b {z4.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
ld1b {z7.h}, p0/z, [x2]
|
||||
ld1b {z6.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
sub v18.8h, v4.8h, v5.8h
|
||||
sub v19.8h, v6.8h, v7.8h
|
||||
ld1b {z1.h}, p0/z, [x2]
|
||||
ld1b {z0.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
ld1b {z3.h}, p0/z, [x2]
|
||||
ld1b {z2.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
sub v20.8h, v0.8h, v1.8h
|
||||
sub v21.8h, v2.8h, v3.8h
|
||||
ld1b {z5.h}, p0/z, [x2]
|
||||
ld1b {z4.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
ld1b {z7.h}, p0/z, [x2]
|
||||
ld1b {z6.h}, p0/z, [x0]
|
||||
add x2, x2, x3
|
||||
add x0, x0, x1
|
||||
|
||||
SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
|
||||
SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
|
||||
|
||||
sub v22.8h, v4.8h, v5.8h
|
||||
sub v23.8h, v6.8h, v7.8h
|
||||
.endm
|
||||
|
||||
// Emit the exported function pixel_var_8x<h>_sve.
// Rows of eight bytes are loaded from [x0] (step x1) and widened to
// halfwords. v0.8h accumulates the per-lane pixel sums; v1.4s and v2.4s
// accumulate pairwise-added squares. The function does not return itself: it
// branches to var_end, which reduces v0/v1/v2 and returns.
// Rows 0-1 are handled before the loop and rows 2-3 are preloaded. Each loop
// iteration consumes four rows and preloads two more; the code after the loop
// consumes the final two preloaded rows.
.macro pixel_var_sve_8 h
|
||||
function pixel_var_8x\h\()_sve, export=1
|
||||
ptrue p0.h, vl8
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
ld1b {z17.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
// x2 counts the rows still to be handled by the loop, four per iteration.
mov x2, \h - 4
|
||||
mul v1.8h, v16.8h, v16.8h
|
||||
mul v2.8h, v17.8h, v17.8h
|
||||
add v0.8h, v16.8h, v17.8h
|
||||
ld1b {z18.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
uaddlp v1.4s, v1.8h
|
||||
uaddlp v2.4s, v2.8h
|
||||
ld1b {z19.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
|
||||
1: subs x2, x2, #4
|
||||
add v0.8h, v0.8h, v18.8h
|
||||
mul v24.8h, v18.8h, v18.8h
|
||||
ld1b {z20.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
add v0.8h, v0.8h, v19.8h
|
||||
mul v25.8h, v19.8h, v19.8h
|
||||
uadalp v1.4s, v24.8h
|
||||
ld1b {z21.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
add v0.8h, v0.8h, v20.8h
|
||||
mul v26.8h, v20.8h, v20.8h
|
||||
uadalp v2.4s, v25.8h
|
||||
ld1b {z18.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
add v0.8h, v0.8h, v21.8h
|
||||
mul v27.8h, v21.8h, v21.8h
|
||||
uadalp v1.4s, v26.8h
|
||||
ld1b {z19.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
uadalp v2.4s, v27.8h
|
||||
b.gt 1b
|
||||
|
||||
add v0.8h, v0.8h, v18.8h
|
||||
mul v28.8h, v18.8h, v18.8h
|
||||
add v0.8h, v0.8h, v19.8h
|
||||
mul v29.8h, v19.8h, v19.8h
|
||||
uadalp v1.4s, v28.8h
|
||||
uadalp v2.4s, v29.8h
|
||||
|
||||
b var_end
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// Shared tail for the pixel_var_* functions, entered with a plain branch.
// In:  v0.8h = per-lane pixel sums, v1.4s and v2.4s = partial sums of squares.
// Out: x0 = (total of squares << 32) | total of pixels.
function var_end
|
||||
add v1.4s, v1.4s, v2.4s
|
||||
uaddlv s0, v0.8h
|
||||
uaddlv d1, v1.4s
|
||||
mov w0, v0.s[0]
|
||||
mov x1, v1.d[0]
|
||||
orr x0, x0, x1, lsl #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Write a + b to \sum and a - b to \sub. \sum is written first, so it must not
// be the same register as \a or \b.
.macro SUMSUBL_AB_SVE sum, sub, a, b
|
||||
add \sum, \a, \b
|
||||
sub \sub, \a, \b
|
||||
.endm
|
||||
|
||||
// Exported 8x8 SA8D entry point.
// Sets p0 to eight halfword lanes, saves the return address in x4, calls the
// transform helper, then adds the helper's v0 and v1, sums the lanes and
// returns (total + 1) >> 1 in w0. Returns through x4 because bl overwrites
// x30.
// NOTE(review): the bl target is spelled like this function's own name.
// Presumably the `function` macro gives the export=1 symbol a prefixed name,
// so the call resolves to the non-exported helper emitted by
// sa8d_satd_sve_8x8 -- confirm against the macro definition in asm.S.
function pixel_sa8d_8x8_sve, export=1
|
||||
ptrue p0.h, vl8
|
||||
mov x4, x30
|
||||
bl pixel_sa8d_8x8_sve
|
||||
add v0.8h, v0.8h, v1.8h
|
||||
uaddlv s0, v0.8h
|
||||
mov w0, v0.s[0]
|
||||
add w0, w0, #1
|
||||
lsr w0, w0, #1
|
||||
ret x4
|
||||
endfunc
|
||||
|
||||
// Emit the non-exported helper pixel_sa8d_<satd>8x8_sve.
// The helper loads the 8x8 block of differences with load_diff_fly_sve_8x8
// (so p0 must already be set by the caller), then runs the remaining
// SUMSUB_AB / transpose stages, takes absolute values and pairwise maxima,
// and leaves two vectors of 16-bit partial sums in v0 and v1.
// When the macro argument is exactly "satd_", the .ifc section also produces
// a second pair of partial sums in v26 and v27. This file only expands the
// macro with an empty argument, so that section is not assembled here.
.macro sa8d_satd_sve_8x8 satd=
|
||||
function pixel_sa8d_\satd\()8x8_sve
|
||||
load_diff_fly_sve_8x8
|
||||
|
||||
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
|
||||
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
|
||||
|
||||
HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
|
||||
.ifc \satd, satd_
|
||||
transpose v0.8h, v1.8h, v16.8h, v17.8h
|
||||
transpose v2.8h, v3.8h, v18.8h, v19.8h
|
||||
transpose v4.8h, v5.8h, v20.8h, v21.8h
|
||||
transpose v6.8h, v7.8h, v22.8h, v23.8h
|
||||
|
||||
SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
|
||||
SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
|
||||
SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
|
||||
SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
|
||||
|
||||
transpose v4.4s, v6.4s, v24.4s, v26.4s
|
||||
transpose v5.4s, v7.4s, v25.4s, v27.4s
|
||||
transpose v24.4s, v26.4s, v0.4s, v2.4s
|
||||
transpose v25.4s, v27.4s, v1.4s, v3.4s
|
||||
|
||||
abs v0.8h, v4.8h
|
||||
abs v1.8h, v5.8h
|
||||
abs v2.8h, v6.8h
|
||||
abs v3.8h, v7.8h
|
||||
abs v4.8h, v24.8h
|
||||
abs v5.8h, v25.8h
|
||||
abs v6.8h, v26.8h
|
||||
abs v7.8h, v27.8h
|
||||
|
||||
umax v0.8h, v0.8h, v2.8h
|
||||
umax v1.8h, v1.8h, v3.8h
|
||||
umax v2.8h, v4.8h, v6.8h
|
||||
umax v3.8h, v5.8h, v7.8h
|
||||
|
||||
add v26.8h, v0.8h, v1.8h
|
||||
add v27.8h, v2.8h, v3.8h
|
||||
.endif
|
||||
|
||||
SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
|
||||
SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
|
||||
SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
|
||||
SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
|
||||
|
||||
transpose v20.8h, v21.8h, v16.8h, v17.8h
|
||||
transpose v4.8h, v5.8h, v0.8h, v1.8h
|
||||
transpose v22.8h, v23.8h, v18.8h, v19.8h
|
||||
transpose v6.8h, v7.8h, v2.8h, v3.8h
|
||||
|
||||
SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
|
||||
SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
|
||||
SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
|
||||
SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
|
||||
|
||||
transpose v20.4s, v22.4s, v2.4s, v0.4s
|
||||
transpose v21.4s, v23.4s, v3.4s, v1.4s
|
||||
transpose v16.4s, v18.4s, v24.4s, v4.4s
|
||||
transpose v17.4s, v19.4s, v25.4s, v5.4s
|
||||
|
||||
SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
|
||||
SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
|
||||
SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
|
||||
SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
|
||||
|
||||
transpose v16.2d, v20.2d, v0.2d, v4.2d
|
||||
transpose v17.2d, v21.2d, v1.2d, v5.2d
|
||||
transpose v18.2d, v22.2d, v2.2d, v6.2d
|
||||
transpose v19.2d, v23.2d, v3.2d, v7.2d
|
||||
|
||||
abs v16.8h, v16.8h
|
||||
abs v20.8h, v20.8h
|
||||
abs v17.8h, v17.8h
|
||||
abs v21.8h, v21.8h
|
||||
abs v18.8h, v18.8h
|
||||
abs v22.8h, v22.8h
|
||||
abs v19.8h, v19.8h
|
||||
abs v23.8h, v23.8h
|
||||
|
||||
umax v16.8h, v16.8h, v20.8h
|
||||
umax v17.8h, v17.8h, v21.8h
|
||||
umax v18.8h, v18.8h, v22.8h
|
||||
umax v19.8h, v19.8h, v23.8h
|
||||
|
||||
add v0.8h, v16.8h, v17.8h
|
||||
add v1.8h, v18.8h, v19.8h
|
||||
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// Emit the exported function pixel_hadamard_ac_<w>x<h>_sve.
// Setup: p0 = eight halfword lanes, v30/v31 = the two rows of mask_ac_4_8,
// v28/v29 = zeroed accumulators, x4 = saved return address.
// hadamard_ac_8x8_sve is then called once per 8x8 sub-block. Each call
// advances x0 by eight rows, so x0 is rewound by 8 rows and moved 8 pixels
// right for the second column, and rewound by 16 rows for the fourth block of
// a 16x16.
// Returns in x0: low 32 bits = (total of v28) >> 1, high 32 bits = (total of
// v29) >> 2.
.macro HADAMARD_AC_SVE w h
|
||||
function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
|
||||
ptrue p0.h, vl8
|
||||
movrel x5, mask_ac_4_8
|
||||
mov x4, x30
|
||||
ld1 {v30.8h,v31.8h}, [x5]
|
||||
movi v28.16b, #0
|
||||
movi v29.16b, #0
|
||||
|
||||
bl hadamard_ac_8x8_sve
|
||||
.if \h > 8
|
||||
bl hadamard_ac_8x8_sve
|
||||
.endif
|
||||
.if \w > 8
|
||||
sub x0, x0, x1, lsl #3
|
||||
add x0, x0, #8
|
||||
bl hadamard_ac_8x8_sve
|
||||
.endif
|
||||
.if \w * \h == 256
|
||||
sub x0, x0, x1, lsl #4
|
||||
bl hadamard_ac_8x8_sve
|
||||
.endif
|
||||
|
||||
addv s1, v29.4s
|
||||
addv s0, v28.4s
|
||||
mov w1, v1.s[0]
|
||||
mov w0, v0.s[0]
|
||||
lsr w1, w1, #2
|
||||
lsr w0, w0, #1
|
||||
orr x0, x0, x1, lsl #32
|
||||
ret x4
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
|
||||
// Process one 8x8 block for the hadamard_ac functions.
// In:  x0 = block pointer, x1 = stride, p0 = load predicate,
//      v30/v31 = lane masks, v28/v29 = running 32-bit accumulators.
// Out: v28 and v29 updated (uadalp); x0 advanced by eight rows.
// Clobbers v0..v7 and v16..v23.
function hadamard_ac_8x8_sve
|
||||
ld1b {z16.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
ld1b {z17.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
ld1b {z18.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
ld1b {z19.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h
|
||||
ld1b {z20.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
ld1b {z21.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h
|
||||
ld1b {z22.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
ld1b {z23.h}, p0/z, [x0]
|
||||
add x0, x0, x1
|
||||
SUMSUBL_AB_SVE v4.8h, v5.8h, v20.8h, v21.8h
|
||||
SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h
|
||||
|
||||
SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
|
||||
SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
|
||||
|
||||
transpose v0.8h, v1.8h, v16.8h, v17.8h
|
||||
transpose v2.8h, v3.8h, v18.8h, v19.8h
|
||||
transpose v4.8h, v5.8h, v20.8h, v21.8h
|
||||
transpose v6.8h, v7.8h, v22.8h, v23.8h
|
||||
|
||||
SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
|
||||
SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
|
||||
SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
|
||||
SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
|
||||
|
||||
transpose v0.4s, v2.4s, v16.4s, v18.4s
|
||||
transpose v1.4s, v3.4s, v17.4s, v19.4s
|
||||
transpose v4.4s, v6.4s, v20.4s, v22.4s
|
||||
transpose v5.4s, v7.4s, v21.4s, v23.4s
|
||||
|
||||
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
|
||||
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
|
||||
SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
|
||||
|
||||
// First accumulation: absolute values of v16..v23 are summed into v28, with
// the v30 mask applied to the v0+v4 partial sum before it is added in.
abs v0.8h, v16.8h
|
||||
abs v4.8h, v20.8h
|
||||
abs v1.8h, v17.8h
|
||||
abs v5.8h, v21.8h
|
||||
abs v2.8h, v18.8h
|
||||
abs v6.8h, v22.8h
|
||||
abs v3.8h, v19.8h
|
||||
abs v7.8h, v23.8h
|
||||
|
||||
add v0.8h, v0.8h, v4.8h
|
||||
add v1.8h, v1.8h, v5.8h
|
||||
and v0.16b, v0.16b, v30.16b
|
||||
add v2.8h, v2.8h, v6.8h
|
||||
add v3.8h, v3.8h, v7.8h
|
||||
add v0.8h, v0.8h, v2.8h
|
||||
add v1.8h, v1.8h, v3.8h
|
||||
uadalp v28.4s, v0.8h
|
||||
uadalp v28.4s, v1.8h
|
||||
|
||||
// Second accumulation: further SUMSUB_AB / transpose stages on v16..v23,
// with the v31 mask applied to v4, summed into v29.
SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
|
||||
SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
|
||||
SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
|
||||
SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
|
||||
|
||||
transpose v16.2d, v17.2d, v6.2d, v7.2d
|
||||
transpose v18.2d, v19.2d, v4.2d, v5.2d
|
||||
transpose v20.2d, v21.2d, v2.2d, v3.2d
|
||||
|
||||
abs v16.8h, v16.8h
|
||||
abs v17.8h, v17.8h
|
||||
abs v18.8h, v18.8h
|
||||
abs v19.8h, v19.8h
|
||||
abs v20.8h, v20.8h
|
||||
abs v21.8h, v21.8h
|
||||
|
||||
transpose v7.2d, v6.2d, v1.2d, v0.2d
|
||||
|
||||
umax v3.8h, v16.8h, v17.8h
|
||||
umax v2.8h, v18.8h, v19.8h
|
||||
umax v1.8h, v20.8h, v21.8h
|
||||
|
||||
SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
|
||||
|
||||
add v2.8h, v2.8h, v3.8h
|
||||
add v2.8h, v2.8h, v1.8h
|
||||
and v4.16b, v4.16b, v31.16b
|
||||
add v2.8h, v2.8h, v2.8h
|
||||
abs v5.8h, v5.8h
|
||||
abs v4.8h, v4.8h
|
||||
add v2.8h, v2.8h, v5.8h
|
||||
add v2.8h, v2.8h, v4.8h
|
||||
uadalp v29.4s, v2.8h
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Instantiate the 8-bit SVE functions: SSD for 4x4, 4x8, 4x16, 8x4 and 8x8,
// variance for 8x8 and 8x16, the sa8d 8x8 helper (empty \satd argument), and
// hadamard_ac for 8x8, 8x16, 16x8 and 16x16.
SSD_FUNC_SVE 4, 4
|
||||
SSD_FUNC_SVE 4, 8
|
||||
SSD_FUNC_SVE 4, 16
|
||||
SSD_FUNC_SVE 8, 4
|
||||
SSD_FUNC_SVE 8, 8
|
||||
|
||||
pixel_var_sve_8 8
|
||||
pixel_var_sve_8 16
|
||||
|
||||
sa8d_satd_sve_8x8
|
||||
|
||||
HADAMARD_AC_SVE 8, 8
|
||||
HADAMARD_AC_SVE 8, 16
|
||||
HADAMARD_AC_SVE 16, 8
|
||||
HADAMARD_AC_SVE 16, 16
|
||||
|
||||
#else /* BIT_DEPTH == 10 */
|
||||
|
||||
// First step of a 4-pixel-wide SSD for 16-bit pixel storage (the #else branch
// of BIT_DEPTH == 8).
// Sets p0 to the first four word lanes, loads four halfwords per row from
// [x0] and [x2] widened to words, and steps the pointers by the stride
// shifted left by one (two bytes per pixel). v0.4s is initialised with the
// squared differences of the first row and the next row is preloaded into
// z16/z17.
.macro SSD_START_SVE_4
|
||||
ptrue p0.s, vl4
|
||||
ld1h {z16.s}, p0/z, [x0]
|
||||
ld1h {z17.s}, p0/z, [x2]
|
||||
add x0, x0, x1, lsl #1
|
||||
add x2, x2, x3, lsl #1
|
||||
sub v2.4s, v16.4s, v17.4s
|
||||
ld1h {z16.s}, p0/z, [x0]
|
||||
ld1h {z17.s}, p0/z, [x2]
|
||||
add x0, x0, x1, lsl #1
|
||||
add x2, x2, x3, lsl #1
|
||||
mul v0.4s, v2.4s, v2.4s
|
||||
.endm
|
||||
|
||||
// Middle step of a 4-pixel-wide SSD for 16-bit pixel storage.
// Takes the difference of the rows already held in v16/v17, preloads the next
// row from [x0]/[x2], and multiply-accumulates the squared differences into
// v0.4s. Expects p0 to have been set up beforehand.
.macro SSD_SVE_4
|
||||
sub v2.4s, v16.4s, v17.4s
|
||||
ld1h {z16.s}, p0/z, [x0]
|
||||
ld1h {z17.s}, p0/z, [x2]
|
||||
add x0, x0, x1, lsl #1
|
||||
add x2, x2, x3, lsl #1
|
||||
mla v0.4s, v2.4s, v2.4s
|
||||
.endm
|
||||
|
||||
// Final step of a 4-pixel-wide SSD for 16-bit pixel storage: squares the
// difference of the rows already held in v16/v17 and accumulates into v0.4s.
// Loads nothing.
.macro SSD_END_SVE_4
|
||||
sub v2.4s, v16.4s, v17.4s
|
||||
mla v0.4s, v2.4s, v2.4s
|
||||
.endm
|
||||
|
||||
// Emit the exported function pixel_ssd_<w>x<h>_sve for 16-bit pixel storage.
// SSD_START_SVE_<w> handles the first row and preloads the second, the
// .rept block runs the middle step h-2 times, and SSD_END_SVE_<w> handles the
// last preloaded row. The four 32-bit partial sums in v0 are then added
// together and returned in w0.
.macro SSD_FUNC_SVE w h
|
||||
function pixel_ssd_\w\()x\h\()_sve, export=1
|
||||
SSD_START_SVE_\w
|
||||
.rept \h-2
|
||||
SSD_SVE_\w
|
||||
.endr
|
||||
SSD_END_SVE_\w
|
||||
|
||||
addv s0, v0.4s
|
||||
fmov w0, s0
|
||||
ret
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
// Instantiate the 4-pixel-wide SSD functions for 4x4, 4x8 and 4x16. No
// 8-wide variants are emitted in this branch.
SSD_FUNC_SVE 4, 4
|
||||
SSD_FUNC_SVE 4, 8
|
||||
SSD_FUNC_SVE 4, 16
|
||||
|
||||
#endif /* BIT_DEPTH == 8 */
|
||||
3040
common/aarch64/pixel-a.S
Normal file
3040
common/aarch64/pixel-a.S
Normal file
File diff suppressed because it is too large
Load Diff
191
common/aarch64/pixel.h
Normal file
191
common/aarch64/pixel.h
Normal file
@@ -0,0 +1,191 @@
|
||||
/*****************************************************************************
|
||||
* pixel.h: aarch64 pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_PIXEL_H
|
||||
#define X264_AARCH64_PIXEL_H
|
||||
|
||||
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
|
||||
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
|
||||
#define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon)
|
||||
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
|
||||
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
|
||||
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
|
||||
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
|
||||
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
|
||||
|
||||
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
|
||||
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
|
||||
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
|
||||
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
|
||||
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
|
||||
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
|
||||
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
|
||||
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
|
||||
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
|
||||
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
|
||||
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
|
||||
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
|
||||
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
|
||||
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
|
||||
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
|
||||
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
|
||||
#define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon)
|
||||
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
|
||||
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
|
||||
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
|
||||
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
|
||||
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
|
||||
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
|
||||
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
|
||||
#define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon)
|
||||
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
|
||||
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
|
||||
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
|
||||
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
|
||||
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
|
||||
|
||||
#if HAVE_DOTPROD
|
||||
#define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod)
|
||||
#define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod)
|
||||
#define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod)
|
||||
#define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod)
|
||||
#define x264_pixel_sad_x4_16x16_neon_dotprod x264_template(pixel_sad_x4_16x16_neon_dotprod)
|
||||
#define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod)
|
||||
|
||||
#define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod)
|
||||
#define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod)
|
||||
#define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod)
|
||||
#define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod)
|
||||
#define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod)
|
||||
#endif // HAVE_DOTPROD
|
||||
|
||||
#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
|
||||
#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
|
||||
#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
|
||||
#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
|
||||
#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
|
||||
/* Declare x264_pixel_<name>_<W>x<H>_<suffix> for the eight block sizes
 * 16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8 and 4x4, each with return type
 * `ret` and the parenthesised parameter list `args`. */
#define DECL_PIXELS( ret, name, suffix, args ) \
|
||||
ret x264_pixel_##name##_16x16_##suffix args;\
|
||||
ret x264_pixel_##name##_16x8_##suffix args;\
|
||||
ret x264_pixel_##name##_8x16_##suffix args;\
|
||||
ret x264_pixel_##name##_8x8_##suffix args;\
|
||||
ret x264_pixel_##name##_8x4_##suffix args;\
|
||||
ret x264_pixel_##name##_4x16_##suffix args;\
|
||||
ret x264_pixel_##name##_4x8_##suffix args;\
|
||||
ret x264_pixel_##name##_4x4_##suffix args;
|
||||
/* Declare the five SVE SSD functions (8x8, 8x4, 4x16, 4x8, 4x4) with return
 * type `ret` and the parenthesised parameter list `args`. Unlike DECL_PIXELS
 * there are no 16-wide or 8x16 entries. */
#define DECL_PIXELS_SSD_SVE( ret, args ) \
|
||||
ret x264_pixel_ssd_8x8_sve args;\
|
||||
ret x264_pixel_ssd_8x4_sve args;\
|
||||
ret x264_pixel_ssd_4x16_sve args;\
|
||||
ret x264_pixel_ssd_4x8_sve args;\
|
||||
ret x264_pixel_ssd_4x4_sve args;
|
||||
|
||||
/* Declare the full DECL_PIXELS set for a metric that takes two
 * (pixel pointer, stride) pairs and returns int. */
#define DECL_X1( name, suffix ) \
|
||||
DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
|
||||
/* Declare the SVE SSD functions with the same signature DECL_X1 uses:
 * two (pixel pointer, stride) pairs, returning int. */
#define DECL_X1_SSD_SVE( ) \
|
||||
DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
|
||||
|
||||
/* Declare both multi-candidate variants of a metric: <name>_x3 takes four
 * pixel pointers, and <name>_x4 takes five. Both also take one stride and an
 * int array, and return void. */
#define DECL_X4( name, suffix ) \
|
||||
DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
|
||||
DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
|
||||
|
||||
DECL_X1( sad, neon )
|
||||
DECL_X4( sad, neon )
|
||||
DECL_X1( satd, neon )
|
||||
DECL_X1( ssd, neon )
|
||||
DECL_X1_SSD_SVE( )
|
||||
|
||||
#if HAVE_DOTPROD
|
||||
DECL_X1( sad, neon_dotprod )
|
||||
DECL_X4( sad, neon_dotprod )
|
||||
DECL_X1( ssd, neon_dotprod )
|
||||
#endif // HAVE_DOTPROD
|
||||
|
||||
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
|
||||
void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * );
|
||||
|
||||
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
|
||||
int x264_pixel_vsad_neon( pixel *, intptr_t, int );
|
||||
|
||||
#if HAVE_DOTPROD
|
||||
#define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod)
|
||||
int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int );
|
||||
#endif // HAVE_DOTPROD
|
||||
|
||||
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
|
||||
int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t );
|
||||
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
|
||||
int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
|
||||
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
|
||||
#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
|
||||
int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
|
||||
|
||||
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
|
||||
uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t );
|
||||
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
|
||||
uint64_t x264_pixel_var_8x16_neon ( pixel *, intptr_t );
|
||||
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
|
||||
uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
|
||||
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
|
||||
int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
|
||||
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
|
||||
int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
|
||||
#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
|
||||
uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t );
|
||||
#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
|
||||
uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
|
||||
|
||||
|
||||
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
|
||||
uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
|
||||
uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
|
||||
uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
|
||||
uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
|
||||
|
||||
|
||||
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
|
||||
void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
|
||||
const pixel *, intptr_t,
|
||||
int sums[2][4] );
|
||||
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
|
||||
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
|
||||
|
||||
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
|
||||
int x264_pixel_asd8_neon( pixel *, intptr_t, pixel *, intptr_t, int );
|
||||
|
||||
#endif
|
||||
908
common/aarch64/predict-a.S
Normal file
908
common/aarch64/predict-a.S
Normal file
@@ -0,0 +1,908 @@
|
||||
/*****************************************************************************
|
||||
* predict.S: aarch64 intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Mans Rullgard <mans@mansr.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
// Eight 16-bit weights: 1..4 repeated twice.
// predict_8x8c_p_neon multiplies its top-row / left-column differences by these.
const p8weight, align=4
|
||||
.short 1, 2, 3, 4, 1, 2, 3, 4
|
||||
endconst
|
||||
// Eight 16-bit weights 1..8, loaded by the *_p_neon (plane) predictors in this file.
const p16weight, align=4
|
||||
.short 1, 2, 3, 4, 5, 6, 7, 8
|
||||
endconst
|
||||
|
||||
// ldcol.8: gather single bytes into lanes of \vd.
//   \vd  destination vector register
//   \xn  address register; post-incremented by \xm after every byte load
//   \xm  byte step between consecutive loads
//   \n   8 (default): fill lanes 0-7; any other value: fill only four lanes
//   \hi  when \n != 8: 0 fills lanes 0-3, 1 fills lanes 4-7
// Lanes that are not loaded keep their previous contents.
.macro ldcol.8 vd, xn, xm, n=8, hi=0
|
||||
.if \n == 8 || \hi == 0
|
||||
ld1 {\vd\().b}[0], [\xn], \xm
|
||||
ld1 {\vd\().b}[1], [\xn], \xm
|
||||
ld1 {\vd\().b}[2], [\xn], \xm
|
||||
ld1 {\vd\().b}[3], [\xn], \xm
|
||||
.endif
|
||||
.if \n == 8 || \hi == 1
|
||||
ld1 {\vd\().b}[4], [\xn], \xm
|
||||
ld1 {\vd\().b}[5], [\xn], \xm
|
||||
ld1 {\vd\().b}[6], [\xn], \xm
|
||||
ld1 {\vd\().b}[7], [\xn], \xm
|
||||
.endif
|
||||
.endm
|
||||
|
||||
// ldcol.16: gather 16 single bytes into lanes 0-15 of \vd, reading from [\xn]
// and post-incrementing \xn by \xm after each byte (lanes 0-7 via ldcol.8).
.macro ldcol.16 vd, xn, xm
|
||||
ldcol.8 \vd, \xn, \xm
|
||||
ld1 {\vd\().b}[ 8], [\xn], \xm
|
||||
ld1 {\vd\().b}[ 9], [\xn], \xm
|
||||
ld1 {\vd\().b}[10], [\xn], \xm
|
||||
ld1 {\vd\().b}[11], [\xn], \xm
|
||||
ld1 {\vd\().b}[12], [\xn], \xm
|
||||
ld1 {\vd\().b}[13], [\xn], \xm
|
||||
ld1 {\vd\().b}[14], [\xn], \xm
|
||||
ld1 {\vd\().b}[15], [\xn], \xm
|
||||
.endm
|
||||
|
||||
|
||||
// 4x4 horizontal fill using general-purpose registers only.
// x0: address of the first output row; rows are FDEC_STRIDE bytes apart.
// For each of the 4 rows, the byte just left of the row (offset -1) is
// replicated into the row's 4 bytes.
function predict_4x4_h_aarch64, export=1
|
||||
ldurb w1, [x0, #0*FDEC_STRIDE-1]
|
||||
mov w5, #0x01010101 // byte * 0x01010101 copies the byte into all 4 bytes of a word
|
||||
ldrb w2, [x0, #1*FDEC_STRIDE-1]
|
||||
ldrb w3, [x0, #2*FDEC_STRIDE-1]
|
||||
mul w1, w1, w5
|
||||
ldrb w4, [x0, #3*FDEC_STRIDE-1]
|
||||
mul w2, w2, w5
|
||||
str w1, [x0, #0*FDEC_STRIDE]
|
||||
mul w3, w3, w5
|
||||
str w2, [x0, #1*FDEC_STRIDE]
|
||||
mul w4, w4, w5
|
||||
str w3, [x0, #2*FDEC_STRIDE]
|
||||
str w4, [x0, #3*FDEC_STRIDE]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 4x4 vertical fill using general-purpose registers only.
// x0: address of the first output row. The 4 bytes of the row above
// (x0 - FDEC_STRIDE) are copied into each of the 4 output rows.
function predict_4x4_v_aarch64, export=1
|
||||
ldur w1, [x0, #0 - 1 * FDEC_STRIDE]
|
||||
str w1, [x0, #0 + 0 * FDEC_STRIDE]
|
||||
str w1, [x0, #0 + 1 * FDEC_STRIDE]
|
||||
str w1, [x0, #0 + 2 * FDEC_STRIDE]
|
||||
str w1, [x0, #0 + 3 * FDEC_STRIDE]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 4x4 DC fill from both neighbours.
// x0: address of the first output row.
// Every output byte is (sum of the 4 bytes above + sum of the 4 bytes to the
// left + 4) >> 3.
function predict_4x4_dc_neon, export=1
|
||||
sub x1, x0, #FDEC_STRIDE
|
||||
ldurb w4, [x0, #-1 + 0 * FDEC_STRIDE]
|
||||
ldrb w5, [x0, #-1 + 1 * FDEC_STRIDE]
|
||||
ldrb w6, [x0, #-1 + 2 * FDEC_STRIDE]
|
||||
ldrb w7, [x0, #-1 + 3 * FDEC_STRIDE]
|
||||
add w4, w4, w5
|
||||
ldr s0, [x1] // 4 bytes of the row above; the rest of v0 is zeroed by the scalar load
|
||||
add w6, w6, w7
|
||||
uaddlv h0, v0.8b // sum over 8 bytes = sum of the 4 loaded bytes (others are zero)
|
||||
add w4, w4, w6 // w4 = sum of the left column
|
||||
dup v0.4h, v0.h[0]
|
||||
dup v1.4h, w4
|
||||
add v0.4h, v0.4h, v1.4h
|
||||
rshrn v0.8b, v0.8h, #3 // rounding shift: (sum + 4) >> 3
|
||||
str s0, [x0]
|
||||
str s0, [x0, #1 * FDEC_STRIDE]
|
||||
str s0, [x0, #2 * FDEC_STRIDE]
|
||||
str s0, [x0, #3 * FDEC_STRIDE]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 4x4 DC fill from the top neighbours only.
// x0: address of the first output row; rows are FDEC_STRIDE bytes apart.
// Every output byte is (sum of the 4 bytes above + 2) >> 2.
function predict_4x4_dc_top_neon, export=1
    sub         x1,  x0,  #FDEC_STRIDE
    ldr         s0, [x1]                    // 4 bytes above; remaining lanes of v0 are zeroed
    uaddlv      h0,  v0.8b                  // sum of the 4 loaded bytes (upper 4 are zero)
    dup         v0.4h,  v0.h[0]
    rshrn       v0.8b,  v0.8h,  #2          // rounding shift: (sum + 2) >> 2
    str         s0, [x0]
    str         s0, [x0, #1 * FDEC_STRIDE]
    str         s0, [x0, #2 * FDEC_STRIDE]
    str         s0, [x0, #3 * FDEC_STRIDE]
    ret                                     // single return; a duplicated, unreachable ret was dropped
endfunc
|
||||
|
||||
// 4x4 diagonal fill built from the row above (including the byte at column -1)
// and the left column.
// x0: address of the first output row.
// The neighbours are assembled into one byte sequence with ext, filtered with
// (a + 2*b + c + 2) >> 2, and each output row is a 4-byte window of the result;
// the window start drops by one byte per row (3, 2, 1, 0).
function predict_4x4_ddr_neon, export=1
|
||||
sub x1, x0, #FDEC_STRIDE+1
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
|
||||
ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
|
||||
ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
|
||||
ext v0.8b, v1.8b, v0.8b, #7
|
||||
ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
|
||||
ext v0.8b, v2.8b, v0.8b, #7 // a
|
||||
ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
|
||||
ext v1.8b, v3.8b, v0.8b, #7 // b
|
||||
ext v2.8b, v4.8b, v1.8b, #7 // c
|
||||
uaddl v0.8h, v0.8b, v1.8b
|
||||
uaddl v1.8h, v1.8b, v2.8b
|
||||
add v0.8h, v0.8h, v1.8h // a + 2*b + c
|
||||
rshrn v0.8b, v0.8h, #2
|
||||
|
||||
ext v3.8b, v0.8b, v0.8b, #3
|
||||
ext v2.8b, v0.8b, v0.8b, #2
|
||||
ext v1.8b, v0.8b, v0.8b, #1
|
||||
|
||||
str s3, [x0], #FDEC_STRIDE
|
||||
str s2, [x0], #FDEC_STRIDE
|
||||
str s1, [x0], #FDEC_STRIDE
|
||||
str s0, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 4x4 diagonal fill built from the 8 bytes of the row above.
// x0: address of the first output row (temporarily moved up one row for the
// load; the post-incremented load brings it back).
// The 8 bytes are filtered with a 3-tap average (uhadd of the outer taps, then
// urhadd with the centre tap), the last byte being repeated past the end.
// Each output row is a 4-byte window of the result starting one byte later.
function predict_4x4_ddl_neon, export=1
|
||||
sub x0, x0, #FDEC_STRIDE
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v0.8b}, [x0], x7
|
||||
dup v3.8b, v0.b[7] // last byte, used as padding
|
||||
ext v1.8b, v0.8b, v0.8b, #1
|
||||
ext v2.8b, v0.8b, v3.8b, #2
|
||||
uhadd v0.8b, v0.8b, v2.8b
|
||||
urhadd v0.8b, v0.8b, v1.8b
|
||||
str s0, [x0], #FDEC_STRIDE
|
||||
ext v1.8b, v0.8b, v0.8b, #1
|
||||
ext v2.8b, v0.8b, v0.8b, #2
|
||||
str s1, [x0], #FDEC_STRIDE
|
||||
ext v3.8b, v0.8b, v0.8b, #3
|
||||
str s2, [x0], #FDEC_STRIDE
|
||||
str s3, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 DC fill.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// Sums edge bytes 7..14 and edge bytes 16..23, then writes
// (sum + 8) >> 4 to every byte of 8 rows of 8 bytes.
function predict_8x8_dc_neon, export=1
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v0.16b}, [x1], #16
|
||||
ld1 {v1.8b}, [x1]
|
||||
ext v0.16b, v0.16b, v0.16b, #7 // rotate so edge bytes 7..14 sit in the low 8 lanes
|
||||
uaddlv h1, v1.8b
|
||||
uaddlv h0, v0.8b
|
||||
add v0.8h, v0.8h, v1.8h
|
||||
dup v0.8h, v0.h[0]
|
||||
rshrn v0.8b, v0.8h, #4
|
||||
.rept 8
|
||||
st1 {v0.8b}, [x0], x7
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 horizontal fill.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// Output row k (0..7) is filled with edge byte 14-k replicated 8 times.
function predict_8x8_h_neon, export=1
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v16.16b}, [x1]
|
||||
dup v0.8b, v16.b[14]
|
||||
dup v1.8b, v16.b[13]
|
||||
st1 {v0.8b}, [x0], x7
|
||||
dup v2.8b, v16.b[12]
|
||||
st1 {v1.8b}, [x0], x7
|
||||
dup v3.8b, v16.b[11]
|
||||
st1 {v2.8b}, [x0], x7
|
||||
dup v4.8b, v16.b[10]
|
||||
st1 {v3.8b}, [x0], x7
|
||||
dup v5.8b, v16.b[9]
|
||||
st1 {v4.8b}, [x0], x7
|
||||
dup v6.8b, v16.b[8]
|
||||
st1 {v5.8b}, [x0], x7
|
||||
dup v7.8b, v16.b[7]
|
||||
st1 {v6.8b}, [x0], x7
|
||||
st1 {v7.8b}, [x0], x7
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 vertical fill.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// Edge bytes 16..23 are copied into each of the 8 output rows.
function predict_8x8_v_neon, export=1
|
||||
add x1, x1, #16
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v0.8b}, [x1]
|
||||
.rept 8
|
||||
st1 {v0.8b}, [x0], x7
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 diagonal fill built from edge bytes 16..31.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// The 16 bytes are filtered with a 3-tap average (previous, current, next);
// "previous" is shifted in from a zero vector and "next" repeats the last byte.
// Output row k (0..7) is the 8-byte window of the result starting at byte k+1.
function predict_8x8_ddl_neon, export=1
|
||||
add x1, x1, #16
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v0.16b}, [x1]
|
||||
movi v3.16b, #0
|
||||
dup v2.16b, v0.b[15]
|
||||
ext v4.16b, v3.16b, v0.16b, #15 // previous byte (lane 0 = 0; result lane 0 is never stored)
|
||||
ext v2.16b, v0.16b, v2.16b, #1 // next byte, last one repeated
|
||||
uhadd v4.16b, v4.16b, v2.16b
|
||||
urhadd v0.16b, v0.16b, v4.16b
|
||||
ext v1.16b, v0.16b, v0.16b, #1
|
||||
ext v2.16b, v0.16b, v0.16b, #2
|
||||
st1 {v1.8b}, [x0], x7
|
||||
ext v3.16b, v0.16b, v0.16b, #3
|
||||
st1 {v2.8b}, [x0], x7
|
||||
ext v4.16b, v0.16b, v0.16b, #4
|
||||
st1 {v3.8b}, [x0], x7
|
||||
ext v5.16b, v0.16b, v0.16b, #5
|
||||
st1 {v4.8b}, [x0], x7
|
||||
ext v6.16b, v0.16b, v0.16b, #6
|
||||
st1 {v5.8b}, [x0], x7
|
||||
ext v7.16b, v0.16b, v0.16b, #7
|
||||
st1 {v6.8b}, [x0], x7
|
||||
ext v0.16b, v0.16b, v0.16b, #8
|
||||
st1 {v7.8b}, [x0], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 diagonal fill built from the first 32 edge bytes.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// Three 16-byte windows starting at edge bytes 7, 8 and 9 are combined with a
// 3-tap average (window 8 is the centre tap). Rows are written bottom-up:
// the last row gets result bytes 0..7 and each row above it starts one byte later.
function predict_8x8_ddr_neon, export=1
|
||||
ld1 {v0.16b,v1.16b}, [x1]
|
||||
ext v2.16b, v0.16b, v1.16b, #7
|
||||
ext v4.16b, v0.16b, v1.16b, #9
|
||||
ext v3.16b, v0.16b, v1.16b, #8
|
||||
|
||||
uhadd v2.16b, v2.16b, v4.16b
|
||||
urhadd v7.16b, v3.16b, v2.16b
|
||||
|
||||
add x0, x0, #7*FDEC_STRIDE
|
||||
mov x7, #-1*FDEC_STRIDE // negative step: store from the last row upwards
|
||||
|
||||
ext v6.16b, v7.16b, v7.16b, #1
|
||||
st1 {v7.8b}, [x0], x7
|
||||
ext v5.16b, v7.16b, v7.16b, #2
|
||||
st1 {v6.8b}, [x0], x7
|
||||
ext v4.16b, v7.16b, v7.16b, #3
|
||||
st1 {v5.8b}, [x0], x7
|
||||
ext v3.16b, v7.16b, v7.16b, #4
|
||||
st1 {v4.8b}, [x0], x7
|
||||
ext v2.16b, v7.16b, v7.16b, #5
|
||||
st1 {v3.8b}, [x0], x7
|
||||
ext v1.16b, v7.16b, v7.16b, #6
|
||||
st1 {v2.8b}, [x0], x7
|
||||
ext v0.16b, v7.16b, v7.16b, #7
|
||||
st1 {v1.8b}, [x0], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 fill built from edge bytes 16..31.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// Two filtered vectors are produced: v3 = 2-tap rounded average of each byte and
// its successor, v0 = 3-tap average of predecessor, byte and successor.
// Rows alternate between windows of v3 (even rows) and v0 (odd rows); the window
// start advances by one byte every two rows.
// NOTE(review): v1 and v2 are read by the two ext instructions before this
// function writes them, so lane 0 of the "predecessor" vector and lane 15 of the
// "successor" vector hold whatever the registers contained on entry. The stores
// below only use lanes 0..10 of v3 and lanes 1..11 of v0, so those stale lanes
// do not reach memory.
function predict_8x8_vl_neon, export=1
|
||||
add x1, x1, #16
|
||||
mov x7, #FDEC_STRIDE
|
||||
|
||||
ld1 {v0.16b}, [x1]
|
||||
ext v1.16b, v1.16b, v0.16b, #15
|
||||
ext v2.16b, v0.16b, v2.16b, #1
|
||||
|
||||
uhadd v1.16b, v1.16b, v2.16b
|
||||
urhadd v3.16b, v0.16b, v2.16b
|
||||
|
||||
urhadd v0.16b, v0.16b, v1.16b
|
||||
|
||||
ext v4.16b, v0.16b, v0.16b, #1
|
||||
st1 {v3.8b}, [x0], x7
|
||||
ext v5.16b, v3.16b, v3.16b, #1
|
||||
st1 {v4.8b}, [x0], x7
|
||||
ext v6.16b, v0.16b, v0.16b, #2
|
||||
st1 {v5.8b}, [x0], x7
|
||||
ext v7.16b, v3.16b, v3.16b, #2
|
||||
st1 {v6.8b}, [x0], x7
|
||||
ext v4.16b, v0.16b, v0.16b, #3
|
||||
st1 {v7.8b}, [x0], x7
|
||||
ext v5.16b, v3.16b, v3.16b, #3
|
||||
st1 {v4.8b}, [x0], x7
|
||||
ext v6.16b, v0.16b, v0.16b, #4
|
||||
st1 {v5.8b}, [x0], x7
|
||||
st1 {v6.8b}, [x0], x7
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 fill built from edge bytes 8..23.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// A 2-tap rounded average (byte, predecessor) and a 3-tap average (byte and its
// two predecessors) are computed over the 16 bytes. The upper halves of the two
// results form rows 0 and 1; each following pair of rows shifts the previous
// pair right by one byte, pulling in bytes from the de-interleaved (uzp1/uzp2)
// lower half of the 3-tap result.
function predict_8x8_vr_neon, export=1
|
||||
add x1, x1, #8
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v2.16b}, [x1]
|
||||
|
||||
ext v1.16b, v2.16b, v2.16b, #14
|
||||
ext v0.16b, v2.16b, v2.16b, #15
|
||||
|
||||
uhadd v3.16b, v2.16b, v1.16b
|
||||
urhadd v2.16b, v2.16b, v0.16b
|
||||
urhadd v0.16b, v0.16b, v3.16b
|
||||
|
||||
ext v1.16b, v2.16b, v2.16b, #8
|
||||
uzp1 v2.8b, v0.8b, v0.8b
|
||||
uzp2 v3.8b, v0.8b, v0.8b
|
||||
ext v0.16b, v0.16b, v0.16b, #8
|
||||
|
||||
st1 {v1.8b}, [x0], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
ext v4.8b, v3.8b, v1.8b, #7
|
||||
ext v5.8b, v2.8b, v0.8b, #7
|
||||
st1 {v4.8b}, [x0], x7
|
||||
st1 {v5.8b}, [x0], x7
|
||||
ext v6.8b, v3.8b, v1.8b, #6
|
||||
ext v7.8b, v2.8b, v0.8b, #6
|
||||
st1 {v6.8b}, [x0], x7
|
||||
st1 {v7.8b}, [x0], x7
|
||||
ext v1.8b, v3.8b, v1.8b, #5
|
||||
ext v0.8b, v2.8b, v0.8b, #5
|
||||
st1 {v1.8b}, [x0], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 fill built from edge bytes 7..22.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// A 2-tap rounded average (byte, successor) and a 3-tap average (byte and its
// two successors) are computed; their lower halves are interleaved byte-wise
// (zip1/zip2) into v16:v17, followed by the upper half of the 3-tap result (v7).
// Each output row is an 8-byte window of that sequence; the window moves back by
// two bytes per row, so row 7 is v16 itself.
function predict_8x8_hd_neon, export=1
|
||||
add x1, x1, #7
|
||||
mov x7, #FDEC_STRIDE
|
||||
|
||||
ld1 {v1.16b}, [x1]
|
||||
ext v3.16b, v1.16b, v1.16b, #1
|
||||
ext v2.16b, v1.16b, v1.16b, #2
|
||||
|
||||
urhadd v4.16b, v1.16b, v3.16b
|
||||
|
||||
uhadd v1.16b, v1.16b, v2.16b
|
||||
urhadd v0.16b, v1.16b, v3.16b
|
||||
|
||||
zip1 v16.8b, v4.8b, v0.8b
|
||||
zip2 v17.8b, v4.8b, v0.8b
|
||||
ext v7.16b, v0.16b, v0.16b, #8
|
||||
|
||||
ext v0.8b, v17.8b, v7.8b, #6
|
||||
ext v1.8b, v17.8b, v7.8b, #4
|
||||
st1 {v0.8b}, [x0], x7
|
||||
ext v2.8b, v17.8b, v7.8b, #2
|
||||
st1 {v1.8b}, [x0], x7
|
||||
st1 {v2.8b}, [x0], x7
|
||||
ext v3.8b, v16.8b, v17.8b, #6
|
||||
st1 {v17.8b}, [x0], x7
|
||||
ext v4.8b, v16.8b, v17.8b, #4
|
||||
st1 {v3.8b}, [x0], x7
|
||||
ext v5.8b, v16.8b, v17.8b, #2
|
||||
st1 {v4.8b}, [x0], x7
|
||||
st1 {v5.8b}, [x0], x7
|
||||
st1 {v16.8b}, [x0], x7
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 fill built from edge bytes 7..14.
// x0: address of the first output row; x1: neighbour ("edge") buffer, read only.
// The 8 bytes are reversed (rev64) and padded at the end with copies of edge
// byte 7. A 2-tap rounded average and a 3-tap average are computed and
// interleaved byte-wise (zip1/zip2) into v16:v17. Each output row is an 8-byte
// window of that sequence starting two bytes later than the previous row;
// beyond the end the last byte pair of v17 is repeated (v18).
function predict_8x8_hu_neon, export=1
|
||||
add x1, x1, #7
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v7.8b}, [x1]
|
||||
dup v6.8b, v7.b[0] // padding value: first loaded byte (last after the reversal)
|
||||
rev64 v7.8b, v7.8b
|
||||
|
||||
ext v4.8b, v7.8b, v6.8b, #2
|
||||
ext v2.8b, v7.8b, v6.8b, #1
|
||||
|
||||
uhadd v5.8b, v7.8b, v4.8b
|
||||
urhadd v0.8b, v2.8b, v7.8b
|
||||
urhadd v1.8b, v5.8b, v2.8b
|
||||
|
||||
zip1 v16.8b, v0.8b, v1.8b
|
||||
zip2 v17.8b, v0.8b, v1.8b
|
||||
|
||||
dup v18.4h, v17.h[3]
|
||||
|
||||
ext v0.8b, v16.8b, v17.8b, #2
|
||||
ext v1.8b, v16.8b, v17.8b, #4
|
||||
ext v2.8b, v16.8b, v17.8b, #6
|
||||
st1 {v16.8b}, [x0], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
st1 {v1.8b}, [x0], x7
|
||||
st1 {v2.8b}, [x0], x7
|
||||
|
||||
ext v4.8b, v17.8b, v18.8b, #2
|
||||
ext v5.8b, v17.8b, v18.8b, #4
|
||||
ext v6.8b, v17.8b, v18.8b, #6
|
||||
st1 {v17.8b}, [x0], x7
|
||||
st1 {v4.8b}, [x0], x7
|
||||
st1 {v5.8b}, [x0], x7
|
||||
st1 {v6.8b}, [x0]
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
// 8x8 chroma DC fill from the row above only.
// x0: address of the first output row.
// The 8 bytes above are split into a left and a right group of 4; each group is
// reduced to (sum + 2) >> 2. v0 and v1 are then built as 8-byte row patterns
// and control continues at pred8x8c_dc_end (in predict_8x8c_dc_neon), which
// expects x1 = FDEC_STRIDE and stores v0 to rows 0-3 and v1 to rows 4-7.
// NOTE(review): `transpose` is a macro defined outside this chunk; it is
// presumably a trn1/trn2 pair that yields {left x4, right x4} in both v0 and v1
// — confirm against asm.S.
function predict_8x8c_dc_top_neon, export=1
|
||||
sub x2, x0, #FDEC_STRIDE
|
||||
mov x1, #FDEC_STRIDE
|
||||
ld1 {v0.8b}, [x2]
|
||||
uaddlp v0.4h, v0.8b
|
||||
addp v0.4h, v0.4h, v0.4h
|
||||
rshrn v0.8b, v0.8h, #2
|
||||
dup v3.8b, v0.b[1]
|
||||
dup v2.8b, v0.b[0]
|
||||
transpose v0.2s, v1.2s, v2.2s, v3.2s
|
||||
b pred8x8c_dc_end
|
||||
endfunc
|
||||
|
||||
// 8x8 chroma DC fill from the left column only.
// x0: address of the first output row.
// v0 = (sum of the left-neighbour bytes of rows 0-3 + 2) >> 2 in every lane,
// v1 = the same for rows 4-7. Control continues at pred8x8c_dc_end
// (in predict_8x8c_dc_neon), which expects x1 = FDEC_STRIDE and stores v0 to
// rows 0-3 and v1 to rows 4-7.
function predict_8x8c_dc_left_neon, export=1
|
||||
ldurb w2, [x0, #0 * FDEC_STRIDE - 1]
|
||||
ldrb w3, [x0, #1 * FDEC_STRIDE - 1]
|
||||
ldrb w4, [x0, #2 * FDEC_STRIDE - 1]
|
||||
ldrb w5, [x0, #3 * FDEC_STRIDE - 1]
|
||||
mov x1, #FDEC_STRIDE
|
||||
add w2, w2, w3
|
||||
add w3, w4, w5
|
||||
ldrb w6, [x0, #4 * FDEC_STRIDE - 1]
|
||||
ldrb w7, [x0, #5 * FDEC_STRIDE - 1]
|
||||
ldrb w8, [x0, #6 * FDEC_STRIDE - 1]
|
||||
ldrb w9, [x0, #7 * FDEC_STRIDE - 1]
|
||||
add w6, w6, w7
|
||||
add w7, w8, w9
|
||||
add w2, w2, w3 // sum of rows 0-3
|
||||
add w6, w6, w7 // sum of rows 4-7
|
||||
dup v0.8h, w2
|
||||
dup v1.8h, w6
|
||||
rshrn v0.8b, v0.8h, #2
|
||||
rshrn v1.8b, v1.8h, #2
|
||||
b pred8x8c_dc_end
|
||||
endfunc
|
||||
|
||||
// 8x8 chroma DC fill from both the row above and the left column.
// x0: address of the first output row.
// Left-column bytes are summed in pairs in general-purpose registers and packed
// as four 16-bit fields of x10 (rows 0-1, 2-3, 4-5, 6-7); the row above is
// pair-summed with uaddlp. The vector code combines them into the per-quadrant
// sums named s0..s3 in the comments below, then rounds with >>3 (two-source
// quadrants) or >>2 (single-source quadrants).
//
// pred8x8c_dc_end is a shared tail also entered from predict_8x8c_dc_top_neon
// and predict_8x8c_dc_left_neon. On entry it needs x0 = first output row,
// x1 = FDEC_STRIDE, v0 = 8-byte pattern for rows 0-3, v1 = pattern for rows 4-7.
function predict_8x8c_dc_neon, export=1
|
||||
mov x1, #FDEC_STRIDE
|
||||
sub x2, x0, #FDEC_STRIDE
|
||||
ldurb w10, [x0, #0 * FDEC_STRIDE - 1]
|
||||
ldrb w11, [x0, #1 * FDEC_STRIDE - 1]
|
||||
ldrb w12, [x0, #2 * FDEC_STRIDE - 1]
|
||||
ldrb w13, [x0, #3 * FDEC_STRIDE - 1]
|
||||
add w10, w10, w11
|
||||
ldrb w4, [x0, #4 * FDEC_STRIDE - 1]
|
||||
ldrb w5, [x0, #5 * FDEC_STRIDE - 1]
|
||||
add w12, w12, w13
|
||||
ldrb w6, [x0, #6 * FDEC_STRIDE - 1]
|
||||
ldrb w7, [x0, #7 * FDEC_STRIDE - 1]
|
||||
add w4, w4, w5
|
||||
add w6, w6, w7
|
||||
add w10, w10, w12, lsl #16
|
||||
add w4, w4, w6, lsl #16
|
||||
ld1 {v0.8b}, [x2]
|
||||
add x10, x10, x4, lsl #32
|
||||
uaddlp v0.4h, v0.8b // s0, s1
|
||||
mov v1.d[0], x10 // s2, s3
|
||||
add v3.4h, v0.4h, v1.4h
|
||||
addp v0.4h, v0.4h, v1.4h // s0, s1, s2, s3
|
||||
addp v1.4h, v3.4h, v3.4h // s0+s2, s1+s3, s0+s2, s1+s3
|
||||
uzp2 v0.4h, v0.4h, v0.4h // s1, s3, s1, s3
|
||||
uzp1 v1.2d, v1.2d, v1.2d
|
||||
uzp1 v0.2d, v0.2d, v0.2d
|
||||
rshrn v3.8b, v1.8h, #3
|
||||
rshrn v2.8b, v0.8h, #2
|
||||
uzp1 v0.8b, v3.8b, v2.8b
|
||||
uzp2 v1.8b, v2.8b, v3.8b
|
||||
pred8x8c_dc_end:
|
||||
add x2, x0, #2 * FDEC_STRIDE
|
||||
add x4, x0, #4 * FDEC_STRIDE
|
||||
add x5, x0, #6 * FDEC_STRIDE
|
||||
st1 {v0.8b}, [x0], x1 // row 0
|
||||
st1 {v0.8b}, [x2], x1 // row 2
|
||||
st1 {v0.8b}, [x0] // row 1
|
||||
st1 {v0.8b}, [x2] // row 3
|
||||
st1 {v1.8b}, [x4], x1 // row 4
|
||||
st1 {v1.8b}, [x5], x1 // row 6
|
||||
st1 {v1.8b}, [x4] // row 5
|
||||
st1 {v1.8b}, [x5] // row 7
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 chroma horizontal fill.
// x0: address of the first output row.
// For each of the 8 rows, the byte just left of the row is replicated (ld1r)
// into the row's 8 bytes; two rows are handled per repetition.
function predict_8x8c_h_neon, export=1
|
||||
sub x1, x0, #1
|
||||
mov x7, #FDEC_STRIDE
|
||||
.rept 4
|
||||
ld1r {v0.8b}, [x1], x7
|
||||
ld1r {v1.8b}, [x1], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
st1 {v1.8b}, [x0], x7
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 chroma vertical fill using general-purpose registers only.
// x0: address of the first output row. The 8 bytes of the row above are loaded
// as one 64-bit value and stored to each of the 8 output rows.
function predict_8x8c_v_aarch64, export=1
|
||||
ldur x1, [x0, #-FDEC_STRIDE]
|
||||
.irp c, 0,1,2,3,4,5,6,7
|
||||
str x1, [x0, #\c * FDEC_STRIDE]
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x8 chroma plane ("P") fill.
// x0: address of the first output row.
// Reads the row above (starting one byte left of the block) and the left column
// (starting one row above the block), forms differences between mirrored
// neighbours and weights them with p8weight to obtain the horizontal and
// vertical slopes b and c (see the inline comments). The first row is
// pix + {0..7}*b; each following row adds c. Values are shifted right by 5 with
// unsigned saturation (sqshrun); the rounding term (+16) is already folded into
// pix.
function predict_8x8c_p_neon, export=1
|
||||
sub x3, x0, #FDEC_STRIDE
|
||||
mov x1, #FDEC_STRIDE
|
||||
add x2, x3, #4
|
||||
sub x3, x3, #1
|
||||
ld1 {v0.s}[0], [x3]
|
||||
ld1 {v2.s}[0], [x2], x1
|
||||
ldcol.8 v0, x3, x1, 4, hi=1
|
||||
add x3, x3, x1
|
||||
ldcol.8 v3, x3, x1, 4
|
||||
movrel x4, p8weight
|
||||
movrel x5, p16weight
|
||||
uaddl v4.8h, v2.8b, v3.8b
|
||||
rev32 v0.8b, v0.8b
|
||||
trn1 v2.2s, v2.2s, v3.2s
|
||||
ld1 {v7.8h}, [x4]
|
||||
usubl v2.8h, v2.8b, v0.8b
|
||||
mul v2.8h, v2.8h, v7.8h
|
||||
ld1 {v0.8h}, [x5]
|
||||
saddlp v2.4s, v2.8h
|
||||
addp v2.4s, v2.4s, v2.4s
|
||||
shl v3.2s, v2.2s, #4
|
||||
add v2.2s, v2.2s, v3.2s // 17 * weighted sum
|
||||
rshrn v5.4h, v2.4s, #5 // b, c, x, x
|
||||
addp v2.4h, v5.4h, v5.4h
|
||||
shl v3.4h, v2.4h, #2
|
||||
sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
|
||||
rev64 v4.4h, v4.4h
|
||||
add v4.4h, v4.4h, v0.4h // lane 0 gains +1 (p16weight[0]), i.e. +16 after the shift below
|
||||
shl v2.4h, v4.4h, #4 // a
|
||||
sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
|
||||
ext v0.16b, v0.16b, v0.16b, #14
|
||||
sub v6.4h, v5.4h, v3.4h
|
||||
mov v0.h[0], wzr // turns 1..8 (rotated) into 0..7
|
||||
mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
|
||||
dup v1.8h, v2.h[0] // pix
|
||||
dup v2.8h, v5.h[1] // c
|
||||
add v1.8h, v1.8h, v0.8h // pix + x*b
|
||||
mov x3, #8 // row counter
|
||||
1:
|
||||
subs x3, x3, #1
|
||||
sqshrun v0.8b, v1.8h, #5
|
||||
add v1.8h, v1.8h, v2.8h
|
||||
st1 {v0.8b}, [x0], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
// loadsum4: \wd = sum of the four bytes at [\x + (\idx+k)*FDEC_STRIDE - 1],
// k = 0..3, i.e. the bytes immediately left of rows \idx..\idx+3 relative to \x.
//   \wd          result register (32-bit)
//   \t1,\t2,\t3  scratch registers, clobbered
//   \x           base address register (not modified)
//   \idx         first row index (assemble-time constant)
// ldurb is used for \idx == 0, where the byte offset is -1.
.macro loadsum4 wd, t1, t2, t3, x, idx
|
||||
.if \idx == 0
|
||||
ldurb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
|
||||
.else
|
||||
ldrb \wd, [\x, #(\idx + 0) * FDEC_STRIDE - 1]
|
||||
.endif
|
||||
ldrb \t1, [\x, #(\idx + 1) * FDEC_STRIDE - 1]
|
||||
ldrb \t2, [\x, #(\idx + 2) * FDEC_STRIDE - 1]
|
||||
ldrb \t3, [\x, #(\idx + 3) * FDEC_STRIDE - 1]
|
||||
add \wd, \wd, \t1
|
||||
add \t1, \t2, \t3
|
||||
add \wd, \wd, \t1
|
||||
.endm
|
||||
|
||||
// 8x16 chroma horizontal fill.
// x0: address of the first output row.
// For each of the 16 rows, the byte just left of the row is replicated into the
// row's 8 bytes. Even rows go through x2 (load) / x0 (store), odd rows through
// x3 / x1; all four pointers step by 2 * FDEC_STRIDE.
function predict_8x16c_h_neon, export=1
|
||||
sub x2, x0, #1
|
||||
add x3, x0, #FDEC_STRIDE - 1
|
||||
mov x7, #2 * FDEC_STRIDE
|
||||
add x1, x0, #FDEC_STRIDE
|
||||
.rept 4
|
||||
ld1r {v0.8b}, [x2], x7
|
||||
ld1r {v1.8b}, [x3], x7
|
||||
ld1r {v2.8b}, [x2], x7
|
||||
ld1r {v3.8b}, [x3], x7
|
||||
st1 {v0.8b}, [x0], x7
|
||||
st1 {v1.8b}, [x1], x7
|
||||
st1 {v2.8b}, [x0], x7
|
||||
st1 {v3.8b}, [x1], x7
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x16 chroma vertical fill.
// x0: address of the first output row.
// The 8 bytes of the row above are copied to all 16 rows. The load
// post-increments x1 by 2 * FDEC_STRIDE, leaving it at row 1, so x0 walks the
// even rows and x1 the odd rows.
function predict_8x16c_v_neon, export=1
|
||||
sub x1, x0, #FDEC_STRIDE
|
||||
mov x2, #2 * FDEC_STRIDE
|
||||
ld1 {v0.8b}, [x1], x2
|
||||
.rept 8
|
||||
st1 {v0.8b}, [x0], x2
|
||||
st1 {v0.8b}, [x1], x2
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x16 chroma plane ("P") fill.
// x0: address of the first output row.
// Reads the row above (starting one byte left of the block) and 16 bytes of the
// left column (starting one row above the block). Weighted differences
// (p16weight) give H and V, from which b = (17*H + 16) >> 5 and
// c = (5*V + 32) >> 6 follow (see inline comments). The first row is
// i00 + {0..7}*b and each following row adds c. Two rows are emitted per loop
// iteration with a rounding, unsigned-saturating shift right by 5 (sqrshrun).
function predict_8x16c_p_neon, export=1
|
||||
movrel x4, p16weight
|
||||
ld1 {v17.8h}, [x4]
|
||||
sub x3, x0, #FDEC_STRIDE
|
||||
mov x1, #FDEC_STRIDE
|
||||
add x2, x3, #4
|
||||
sub x3, x3, #1
|
||||
|
||||
ld1 {v0.8b}, [x3]
|
||||
ld1 {v2.8b}, [x2], x1
|
||||
ldcol.8 v1, x3, x1
|
||||
add x3, x3, x1
|
||||
ldcol.8 v3, x3, x1
|
||||
ext v4.8b, v2.8b, v2.8b, #3
|
||||
ext v5.8b, v3.8b, v3.8b, #7
|
||||
rev32 v0.8b, v0.8b
|
||||
rev64 v1.8b, v1.8b
|
||||
|
||||
uaddl v4.8h, v5.8b, v4.8b // a * 1/16
|
||||
|
||||
usubl v2.8h, v2.8b, v0.8b
|
||||
mul v2.8h, v2.8h, v17.8h
|
||||
saddlp v2.4s, v2.8h
|
||||
addp v2.4s, v2.4s, v2.4s // H
|
||||
|
||||
usubl v3.8h, v3.8b, v1.8b
|
||||
mul v3.8h, v3.8h, v17.8h
|
||||
saddlp v3.4s, v3.8h
|
||||
addp v3.4s, v3.4s, v3.4s
|
||||
addp v3.4s, v3.4s, v3.4s // V
|
||||
|
||||
ext v17.16b, v17.16b, v17.16b, #14
|
||||
|
||||
shl v4.4h, v4.4h, #4 // a
|
||||
shl v6.2s, v2.2s, #4 // 16 * H
|
||||
shl v7.2s, v3.2s, #2 // 4 * V
|
||||
add v2.2s, v2.2s, v6.2s // 17 * H
|
||||
add v3.2s, v3.2s, v7.2s // 5 * V
|
||||
rshrn v2.4h, v2.4s, #5 // b
|
||||
rshrn v3.4h, v3.4s, #6 // c
|
||||
|
||||
mov v17.h[0], wzr // rotated weights become 0..7
|
||||
|
||||
sub v4.4h, v4.4h, v2.4h // a - b
|
||||
shl v6.4h, v2.4h, #1 // 2 * b
|
||||
add v4.4h, v4.4h, v3.4h // a - b + c
|
||||
shl v7.4h, v3.4h, #3 // 8 * c
|
||||
sub v4.4h, v4.4h, v6.4h // a - 3b + c
|
||||
sub v4.4h, v4.4h, v7.4h // a - 3b - 7c
|
||||
|
||||
mul v0.8h, v17.8h, v2.h[0] // 0,1,2,3,4,5,6,7 * b
|
||||
dup v1.8h, v4.h[0] // i00
|
||||
dup v2.8h, v3.h[0] // c
|
||||
add v1.8h, v1.8h, v0.8h // pix + {0..7}*b
|
||||
mov x3, #16 // row counter, decremented by 2 per iteration
|
||||
1:
|
||||
subs x3, x3, #2
|
||||
sqrshrun v4.8b, v1.8h, #5
|
||||
add v1.8h, v1.8h, v2.8h
|
||||
sqrshrun v5.8b, v1.8h, #5
|
||||
st1 {v4.8b}, [x0], x1
|
||||
add v1.8h, v1.8h, v2.8h
|
||||
st1 {v5.8b}, [x0], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x16 chroma DC fill from both the row above and the left column.
// x0: address of the first output row.
// s0/s1 are the sums of the left/right 4 bytes of the row above; s2..s5 are the
// left-column sums of rows 0-3, 4-7, 8-11 and 12-15 (loadsum4). Each group of
// four rows gets one 8-byte pattern {left half, right half}:
//   rows 0-3:   (s0+s2+4)>>3, (2*s1+4)>>3
//   rows 4-7:   (2*s3+4)>>3,  (s1+s3+4)>>3
//   rows 8-11:  (2*s4+4)>>3,  (s1+s4+4)>>3
//   rows 12-15: (2*s5+4)>>3,  (s1+s5+4)>>3
function predict_8x16c_dc_neon, export=1
|
||||
mov x1, #FDEC_STRIDE
|
||||
sub x10, x0, #FDEC_STRIDE
|
||||
loadsum4 w2, w3, w4, w5, x0, 0
|
||||
ld1 {v6.8b}, [x10]
|
||||
loadsum4 w6, w7, w8, w9, x0, 4
|
||||
uaddlp v6.4h, v6.8b
|
||||
dup v22.8h, w2 // s2
|
||||
dup v23.8h, w6 // s3
|
||||
loadsum4 w2, w3, w4, w5, x0, 8
|
||||
addp v6.4h, v6.4h, v6.4h // s0, s1
|
||||
loadsum4 w6, w7, w8, w9, x0, 12
|
||||
dup v20.8h, v6.h[0] // s0
|
||||
dup v21.8h, v6.h[1] // s1
|
||||
dup v24.8h, w2 // s4
|
||||
dup v25.8h, w6 // s5
|
||||
|
||||
ext v16.16b, v20.16b, v21.16b, #8 // {s0 x4, s1 x4}
|
||||
ext v17.16b, v22.16b, v21.16b, #8 // {s2 x4, s1 x4}
|
||||
ext v1.16b, v23.16b, v21.16b, #8
|
||||
ext v2.16b, v24.16b, v21.16b, #8
|
||||
ext v3.16b, v25.16b, v21.16b, #8
|
||||
|
||||
add v0.8h, v16.8h, v17.8h
|
||||
add v1.8h, v1.8h, v23.8h
|
||||
add v2.8h, v2.8h, v24.8h
|
||||
add v3.8h, v3.8h, v25.8h
|
||||
|
||||
rshrn v0.8b, v0.8h, #3
|
||||
rshrn v1.8b, v1.8h, #3
|
||||
rshrn v2.8b, v2.8h, #3
|
||||
rshrn v3.8b, v3.8h, #3
|
||||
|
||||
add x11, x0, #4 * FDEC_STRIDE
|
||||
add x12, x0, #8 * FDEC_STRIDE
|
||||
add x13, x0, #12 * FDEC_STRIDE
|
||||
.rept 4
|
||||
st1 {v0.8b}, [x0], x1
|
||||
st1 {v1.8b}, [x11], x1
|
||||
st1 {v2.8b}, [x12], x1
|
||||
st1 {v3.8b}, [x13], x1
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x16 chroma DC fill from the left column only.
// x0: address of the first output row.
// Each group of four rows (0-3, 4-7, 8-11, 12-15) is filled with
// (sum of that group's four left-neighbour bytes + 2) >> 2, held in v0..v3.
// All 16 left-column loads are issued before the first store; loads and
// arithmetic are interleaved.
function predict_8x16c_dc_left_neon, export=1
|
||||
mov x1, #FDEC_STRIDE
|
||||
ldurb w2, [x0, # 0 * FDEC_STRIDE - 1]
|
||||
ldrb w3, [x0, # 1 * FDEC_STRIDE - 1]
|
||||
ldrb w4, [x0, # 2 * FDEC_STRIDE - 1]
|
||||
ldrb w5, [x0, # 3 * FDEC_STRIDE - 1]
|
||||
add w2, w2, w3
|
||||
|
||||
ldrb w6, [x0, # 4 * FDEC_STRIDE - 1]
|
||||
add w4, w4, w5
|
||||
ldrb w7, [x0, # 5 * FDEC_STRIDE - 1]
|
||||
add w2, w2, w4
|
||||
ldrb w8, [x0, # 6 * FDEC_STRIDE - 1]
|
||||
ldrb w9, [x0, # 7 * FDEC_STRIDE - 1]
|
||||
dup v0.8h, w2
|
||||
add w6, w6, w7
|
||||
rshrn v0.8b, v0.8h, #2
|
||||
add w8, w8, w9
|
||||
|
||||
ldrb w10, [x0, # 8 * FDEC_STRIDE - 1]
|
||||
ldrb w11, [x0, # 9 * FDEC_STRIDE - 1]
|
||||
add w6, w6, w8
|
||||
ldrb w12, [x0, #10 * FDEC_STRIDE - 1]
|
||||
ldrb w13, [x0, #11 * FDEC_STRIDE - 1]
|
||||
dup v1.8h, w6
|
||||
add w10, w10, w11
|
||||
rshrn v1.8b, v1.8h, #2
|
||||
add w12, w12, w13
|
||||
|
||||
ldrb w2, [x0, #12 * FDEC_STRIDE - 1]
|
||||
ldrb w3, [x0, #13 * FDEC_STRIDE - 1]
|
||||
add w10, w10, w12
|
||||
ldrb w4, [x0, #14 * FDEC_STRIDE - 1]
|
||||
ldrb w5, [x0, #15 * FDEC_STRIDE - 1]
|
||||
dup v2.8h, w10
|
||||
add w2, w2, w3
|
||||
rshrn v2.8b, v2.8h, #2
|
||||
add w4, w4, w5
|
||||
st1 {v0.8b}, [x0], x1
|
||||
st1 {v0.8b}, [x0], x1
|
||||
add w2, w2, w4
|
||||
st1 {v0.8b}, [x0], x1
|
||||
dup v3.8h, w2
|
||||
st1 {v0.8b}, [x0], x1
|
||||
rshrn v3.8b, v3.8h, #2
|
||||
|
||||
.irp idx, 1, 2, 3
|
||||
.rept 4
|
||||
st1 {v\idx\().8b}, [x0], x1
|
||||
.endr
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 8x16 chroma DC fill from the row above only.
// x0: address of the first output row.
// The 8 bytes above are split into a left and a right group of 4; each group is
// reduced to (sum + 2) >> 2. All 16 rows receive the same 8-byte pattern
// {left value x4, right value x4}.
function predict_8x16c_dc_top_neon, export=1
|
||||
sub x2, x0, #FDEC_STRIDE
|
||||
mov x1, #FDEC_STRIDE
|
||||
ld1 {v0.8b}, [x2]
|
||||
uaddlp v0.4h, v0.8b
|
||||
addp v0.4h, v0.4h, v0.4h
|
||||
rshrn v4.8b, v0.8h, #2
|
||||
dup v0.8b, v4.b[0]
|
||||
dup v1.8b, v4.b[1]
|
||||
ext v0.8b, v0.8b, v1.8b, #4
|
||||
.rept 16
|
||||
st1 {v0.8b}, [x0], x1
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
|
||||
// 16x16 DC fill from the row above only.
// x0: address of the first output row.
// v0 = (sum of the 16 bytes above + 8) >> 4 in every lane; control continues at
// pred16x16_dc_end (in predict_16x16_dc_neon), which expects x1 = FDEC_STRIDE
// and stores v0 to 16 rows.
function predict_16x16_dc_top_neon, export=1
|
||||
sub x2, x0, #FDEC_STRIDE
|
||||
mov x1, #FDEC_STRIDE
|
||||
ld1 {v0.16b}, [x2]
|
||||
uaddlv h0, v0.16b
|
||||
rshrn v0.8b, v0.8h, #4
|
||||
dup v0.16b, v0.b[0]
|
||||
b pred16x16_dc_end
|
||||
endfunc
|
||||
|
||||
// 16x16 DC fill from the left column only.
// x0: address of the first output row.
// v0 = (sum of the 16 left-neighbour bytes + 8) >> 4 in every lane; control
// continues at pred16x16_dc_end (in predict_16x16_dc_neon), which expects
// x1 = FDEC_STRIDE and stores v0 to 16 rows.
function predict_16x16_dc_left_neon, export=1
|
||||
sub x2, x0, #1
|
||||
mov x1, #FDEC_STRIDE
|
||||
ldcol.16 v0, x2, x1
|
||||
uaddlv h0, v0.16b
|
||||
rshrn v0.8b, v0.8h, #4
|
||||
dup v0.16b, v0.b[0]
|
||||
b pred16x16_dc_end
|
||||
endfunc
|
||||
|
||||
// 16x16 DC fill from both the row above and the left column.
// x0: address of the first output row.
// v0 = (sum of 16 bytes above + sum of 16 left-neighbour bytes + 16) >> 5 in
// every lane.
//
// pred16x16_dc_end is a shared tail also entered from predict_16x16_dc_top_neon
// and predict_16x16_dc_left_neon. On entry it needs x0 = first output row,
// x1 = FDEC_STRIDE and v0 = the 16-byte row pattern.
function predict_16x16_dc_neon, export=1
|
||||
sub x3, x0, #FDEC_STRIDE
|
||||
sub x2, x0, #1
|
||||
mov x1, #FDEC_STRIDE
|
||||
ld1 {v0.16b}, [x3]
|
||||
ldcol.16 v1, x2, x1
|
||||
uaddlv h0, v0.16b
|
||||
uaddlv h1, v1.16b
|
||||
add v0.4h, v0.4h, v1.4h
|
||||
rshrn v0.8b, v0.8h, #5
|
||||
dup v0.16b, v0.b[0]
|
||||
pred16x16_dc_end:
|
||||
.rept 16
|
||||
st1 {v0.16b}, [x0], x1
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 16x16 horizontal fill.
// x0: address of the first output row.
// For each of the 16 rows, the byte just left of the row is replicated (ld1r)
// into the row's 16 bytes; two rows are handled per repetition.
function predict_16x16_h_neon, export=1
|
||||
sub x1, x0, #1
|
||||
mov x7, #FDEC_STRIDE
|
||||
.rept 8
|
||||
ld1r {v0.16b}, [x1], x7
|
||||
ld1r {v1.16b}, [x1], x7
|
||||
st1 {v0.16b}, [x0], x7
|
||||
st1 {v1.16b}, [x0], x7
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 16x16 vertical fill.
// x0: address of the first output row (moved up one row for the load; the
// post-incremented load brings it back to row 0).
// The 16 bytes of the row above are copied to each of the 16 output rows.
function predict_16x16_v_neon, export=1
|
||||
sub x0, x0, #FDEC_STRIDE
|
||||
mov x7, #FDEC_STRIDE
|
||||
ld1 {v0.16b}, [x0], x7
|
||||
.rept 16
|
||||
st1 {v0.16b}, [x0], x7
|
||||
.endr
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// 16x16 plane ("P") fill.
// x0: address of the first output row.
// Reads the row above (starting one byte left of the block) and the left column
// (starting one row above the block), forms differences between mirrored
// neighbours weighted by p16weight, and derives the slopes b and c as
// (5 * weighted sum + 32) >> 6. v1 holds pix + {0..7}*b and v3 the same for
// columns 8..15 (v1 + 8*b); every row both are shifted right by 5 with unsigned
// saturation and then advanced by c. The rounding term (+16) is already folded
// into pix.
function predict_16x16_p_neon, export=1
|
||||
sub x3, x0, #FDEC_STRIDE
|
||||
mov x1, #FDEC_STRIDE
|
||||
add x2, x3, #8
|
||||
sub x3, x3, #1
|
||||
ld1 {v0.8b}, [x3]
|
||||
ld1 {v2.8b}, [x2], x1
|
||||
ldcol.8 v1, x3, x1
|
||||
add x3, x3, x1
|
||||
ldcol.8 v3, x3, x1
|
||||
rev64 v0.8b, v0.8b
|
||||
rev64 v1.8b, v1.8b
|
||||
movrel x4, p16weight
|
||||
uaddl v4.8h, v2.8b, v3.8b
|
||||
ld1 {v7.8h}, [x4]
|
||||
usubl v2.8h, v2.8b, v0.8b
|
||||
usubl v3.8h, v3.8b, v1.8b
|
||||
mul v2.8h, v2.8h, v7.8h
|
||||
mul v3.8h, v3.8h, v7.8h
|
||||
saddlp v2.4s, v2.8h
|
||||
saddlp v3.4s, v3.8h
|
||||
addp v2.4s, v2.4s, v3.4s
|
||||
addp v2.4s, v2.4s, v2.4s
|
||||
shl v3.2s, v2.2s, #2
|
||||
add v2.2s, v2.2s, v3.2s // 5 * weighted sum
|
||||
rshrn v5.4h, v2.4s, #6 // b, c, x, x
|
||||
addp v2.4h, v5.4h, v5.4h
|
||||
shl v3.4h, v2.4h, #3
|
||||
sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
|
||||
ext v4.16b, v4.16b, v4.16b, #14
|
||||
add v4.4h, v4.4h, v7.4h // lane 0 gains +1 (p16weight[0]), i.e. +16 after the shift below
|
||||
shl v2.4h, v4.4h, #4 // a
|
||||
sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
|
||||
ext v7.16b, v7.16b, v7.16b, #14
|
||||
mov v7.h[0], wzr // rotated weights become 0..7
|
||||
dup v3.8h, v5.h[0]
|
||||
mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
|
||||
dup v1.8h, v2.h[0] // pix
|
||||
dup v2.8h, v5.h[1] // c
|
||||
shl v3.8h, v3.8h, #3 // 8 * b
|
||||
add v1.8h, v1.8h, v0.8h // pix + x*b
|
||||
add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
|
||||
mov x3, #16 // row counter
|
||||
1:
|
||||
subs x3, x3, #1
|
||||
sqshrun v0.8b, v1.8h, #5
|
||||
add v1.8h, v1.8h, v2.8h
|
||||
sqshrun2 v0.16b, v3.8h, #5
|
||||
add v3.8h, v3.8h, v2.8h
|
||||
st1 {v0.16b}, [x0], x1
|
||||
b.ne 1b
|
||||
ret
|
||||
endfunc
|
||||
116
common/aarch64/predict-c.c
Normal file
116
common/aarch64/predict-c.c
Normal file
@@ -0,0 +1,116 @@
|
||||
/*****************************************************************************
|
||||
* predict.c: aarch64 intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "predict.h"
|
||||
#include "pixel.h"
|
||||
|
||||
/* Install the aarch64 4x4 intra predictors permitted by the given CPU flags.
 *
 * cpu: bitmask of X264_CPU_* capability flags.
 * pf:  table of 4x4 predictor function pointers, indexed by I_PRED_4x4_*;
 *      only the entries listed below are overwritten.
 *
 * Nothing is installed in HIGH_BIT_DEPTH builds. */
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] )
{
#if !HIGH_BIT_DEPTH
    /* Versions gated only on the ARMv8 flag. */
    if( cpu & X264_CPU_ARMV8 )
    {
        pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
        pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
    }

    /* The remaining entries are only installed when NEON is flagged. */
    if( !(cpu & X264_CPU_NEON) )
        return;

    pf[I_PRED_4x4_DDR]    = x264_predict_4x4_ddr_neon;
    pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
    pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
    pf[I_PRED_4x4_DC]     = x264_predict_4x4_dc_neon;
#endif // !HIGH_BIT_DEPTH
}
|
||||
|
||||
/* Install the aarch64 8x8 chroma intra predictors permitted by the CPU flags.
 *
 * cpu: bitmask of X264_CPU_* capability flags.
 * pf:  table of chroma predictor function pointers, indexed by I_PRED_CHROMA_*;
 *      only the entries listed below are overwritten.
 *
 * Nothing is installed in HIGH_BIT_DEPTH builds. */
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
#if !HIGH_BIT_DEPTH
    /* Version gated only on the ARMv8 flag. */
    if( cpu & X264_CPU_ARMV8 )
        pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_aarch64;

    /* NEON versions. */
    if( cpu & X264_CPU_NEON )
    {
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
    }
#endif // !HIGH_BIT_DEPTH
}
|
||||
|
||||
|
||||
/* Install the NEON 8x16 chroma intra predictors when the CPU flags allow it.
 *
 * cpu: bitmask of X264_CPU_* capability flags.
 * pf:  table of chroma predictor function pointers, indexed by I_PRED_CHROMA_*;
 *      left untouched without NEON or in HIGH_BIT_DEPTH builds. */
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
|
||||
/* Install the NEON 8x8 intra predictors when the CPU flags allow it.
 *
 * cpu:            bitmask of X264_CPU_* capability flags.
 * pf:             table of 8x8 predictor function pointers, indexed by
 *                 I_PRED_8x8_*; left untouched without NEON or in
 *                 HIGH_BIT_DEPTH builds.
 * predict_filter: not read or written by this function. */
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
|
||||
/* Fill the 16x16 luma intra prediction table `pf` with NEON routines.
 * Nothing is assigned unless `cpu` carries X264_CPU_NEON, and the
 * assignments themselves are compiled out when HIGH_BIT_DEPTH is set. */
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_H]       = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_V]       = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_P]       = x264_predict_16x16_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
119
common/aarch64/predict.h
Normal file
119
common/aarch64/predict.h
Normal file
@@ -0,0 +1,119 @@
|
||||
/*****************************************************************************
|
||||
* predict.h: aarch64 intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_PREDICT_H
|
||||
#define X264_AARCH64_PREDICT_H
|
||||
|
||||
#define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
|
||||
void x264_predict_4x4_h_aarch64( uint8_t *src );
|
||||
#define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
|
||||
void x264_predict_4x4_v_aarch64( uint8_t *src );
|
||||
#define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
|
||||
void x264_predict_8x8c_v_aarch64( uint8_t *src );
|
||||
|
||||
// for the merged 4x4 intra sad/satd which expects unified suffix
|
||||
#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
|
||||
#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
|
||||
#define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
|
||||
|
||||
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
|
||||
void x264_predict_4x4_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
|
||||
void x264_predict_4x4_ddr_neon( uint8_t *src );
|
||||
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
|
||||
void x264_predict_4x4_ddl_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
|
||||
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
|
||||
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
|
||||
void x264_predict_8x8c_p_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
|
||||
void x264_predict_8x16c_dc_left_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
|
||||
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
|
||||
void x264_predict_8x16c_p_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
|
||||
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
|
||||
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
|
||||
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
|
||||
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
|
||||
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
|
||||
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
|
||||
|
||||
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
|
||||
void x264_predict_16x16_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
|
||||
void x264_predict_16x16_dc_left_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
|
||||
void x264_predict_16x16_p_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
|
||||
void x264_predict_4x4_dc_neon( uint8_t *src );
|
||||
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
|
||||
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
|
||||
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
|
||||
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
|
||||
void x264_predict_8x8c_dc_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
|
||||
void x264_predict_8x8c_h_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
|
||||
void x264_predict_8x16c_v_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
|
||||
void x264_predict_8x16c_h_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
|
||||
void x264_predict_8x16c_dc_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
|
||||
void x264_predict_16x16_v_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
|
||||
void x264_predict_16x16_h_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
|
||||
void x264_predict_16x16_dc_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64)
|
||||
void x264_predict_4x4_init_aarch64( uint32_t cpu, x264_predict_t pf[12] );
|
||||
#define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64)
|
||||
void x264_predict_8x8_init_aarch64( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
|
||||
#define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64)
|
||||
void x264_predict_8x8c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64)
|
||||
void x264_predict_8x16c_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64)
|
||||
void x264_predict_16x16_init_aarch64( uint32_t cpu, x264_predict_t pf[7] );
|
||||
|
||||
#endif /* X264_AARCH64_PREDICT_H */
|
||||
1169
common/aarch64/quant-a.S
Normal file
1169
common/aarch64/quant-a.S
Normal file
File diff suppressed because it is too large
Load Diff
95
common/aarch64/quant.h
Normal file
95
common/aarch64/quant.h
Normal file
@@ -0,0 +1,95 @@
|
||||
/*****************************************************************************
|
||||
* quant.h: aarch64 quantization and level-run
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2005-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_AARCH64_QUANT_H
|
||||
#define X264_AARCH64_QUANT_H
|
||||
|
||||
#define x264_quant_2x2_dc_aarch64 x264_template(quant_2x2_dc_aarch64)
|
||||
int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
|
||||
|
||||
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
|
||||
int x264_quant_2x2_dc_neon( dctcoef dct[4], int mf, int bias );
|
||||
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
|
||||
int x264_quant_4x4_dc_neon( dctcoef dct[16], int mf, int bias );
|
||||
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
|
||||
int x264_quant_4x4_neon( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
|
||||
int x264_quant_4x4x4_neon( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
|
||||
int x264_quant_8x8_neon( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||
|
||||
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
|
||||
void x264_dequant_4x4_dc_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
|
||||
void x264_dequant_4x4_neon( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
|
||||
void x264_dequant_8x8_neon( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||
|
||||
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
|
||||
int x264_decimate_score15_neon( dctcoef * );
|
||||
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
|
||||
int x264_decimate_score16_neon( dctcoef * );
|
||||
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
|
||||
int x264_decimate_score64_neon( dctcoef * );
|
||||
|
||||
// BIT_DEPTH = 8
|
||||
#define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64)
|
||||
int x264_coeff_last4_aarch64( dctcoef * );
|
||||
#define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64)
|
||||
int x264_coeff_last8_aarch64( dctcoef * );
|
||||
|
||||
// BIT_DEPTH = 10
|
||||
#define x264_coeff_last4_neon x264_template(coeff_last4_neon)
|
||||
int x264_coeff_last4_neon( dctcoef * );
|
||||
#define x264_coeff_last8_neon x264_template(coeff_last8_neon)
|
||||
int x264_coeff_last8_neon( dctcoef * );
|
||||
|
||||
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
|
||||
int x264_coeff_last15_neon( dctcoef * );
|
||||
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
|
||||
int x264_coeff_last16_neon( dctcoef * );
|
||||
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
|
||||
int x264_coeff_last64_neon( dctcoef * );
|
||||
|
||||
// BIT_DEPTH = 8
|
||||
#define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64)
|
||||
int x264_coeff_level_run4_aarch64( dctcoef *, x264_run_level_t * );
|
||||
|
||||
// BIT_DEPTH = 10
|
||||
#define x264_coeff_level_run4_neon x264_template(coeff_level_run4_neon)
|
||||
int x264_coeff_level_run4_neon( dctcoef *, x264_run_level_t * );
|
||||
|
||||
#define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon)
|
||||
int x264_coeff_level_run8_neon( dctcoef *, x264_run_level_t * );
|
||||
#define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon)
|
||||
int x264_coeff_level_run15_neon( dctcoef *, x264_run_level_t * );
|
||||
#define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon)
|
||||
int x264_coeff_level_run16_neon( dctcoef *, x264_run_level_t * );
|
||||
|
||||
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
|
||||
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user