x264 source for verification 2026-05-22
This commit is contained in:
263
common/arm/asm.S
Normal file
263
common/arm/asm.S
Normal file
@@ -0,0 +1,263 @@
|
||||
/*****************************************************************************
|
||||
* asm.S: arm utility macros
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2008-2025 x264 project
|
||||
*
|
||||
* Authors: Mans Rullgard <mans@mansr.com>
|
||||
* David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "config.h"
|
||||
|
||||
/* Use the unified ARM/Thumb assembler syntax for everything below. */
        .syntax unified

/* Architecture and FPU selection is only emitted for ELF toolchains. */
#if defined(__ELF__)
        .arch   armv7-a
        .fpu    neon
#endif

/* Two-level paste so that macro arguments are expanded before ## is applied. */
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)

/* BASE is the stem of every symbol name; PREFIX builds add a leading '_'. */
#if defined(PREFIX)
#   define BASE         _x264_
#   define SYM_PREFIX   _
#else
#   define BASE         x264_
#   define SYM_PREFIX
#endif

/* When BIT_DEPTH is defined it is folded into the exported name stem. */
#if defined(BIT_DEPTH)
#   define EXTERN_ASM   JOIN(JOIN(BASE, BIT_DEPTH), _)
#else
#   define EXTERN_ASM   BASE
#endif

/* Name builders: X() = EXTERN_ASM stem, X264() = BASE stem, EXT() = bare prefix. */
#define X(s)    JOIN(EXTERN_ASM, s)
#define X264(s) JOIN(BASE, s)
#define EXT(s)  JOIN(SYM_PREFIX, s)

/*
 * Line prefixes. Each one expands either to nothing (the rest of the line is
 * assembled) or to '@', which turns the rest of the line into a comment.
 */
#if defined(__ELF__)
#   define ELF
#else
#   define ELF @
#endif

#if defined(__MACH__)
#   define MACH
#   define NONMACH @
#else
#   define MACH @
#   define NONMACH
#endif

#if HAVE_AS_FUNC
#   define FUNC
#else
#   define FUNC @
#endif

/* Numeric 0/1 so that it can be used in assembler .if expressions (see const). */
#if SYS_LINUX || SYS_OPENBSD
#define HAVE_SECTION_DATA_REL_RO 1
#else
#define HAVE_SECTION_DATA_REL_RO 0
#endif
|
||||
|
||||
/* ELF only: emit EABI build attribute 24 (Tag_ABI_align_needed) with value val. */
.macro require8, val=1
ELF     .eabi_attribute 24, \val
.endm
|
||||
|
||||
/* ELF only: emit EABI build attribute 25 (Tag_ABI_align_preserved) with value val. */
.macro preserve8, val=1
ELF     .eabi_attribute 25, \val
.endm
|
||||
|
||||
/*
 * function name, export=1
 *
 * Opens a routine in .text, aligned to 4 bytes (.align 2).
 *   export == 1 : the label is EXTERN_ASM<name>, declared .global and, on ELF,
 *                 .hidden with type %function.
 *   otherwise   : the label is the bare <name>; on ELF it is marked .hidden
 *                 with type %function, and no .global is emitted.
 * A one-shot endfunc macro is defined alongside; it emits the ELF .size for
 * the same label, closes the FUNC .func scope and then purges itself so the
 * next function can define it again.
 */
.macro function name, export=1
    .macro endfunc
.if \export
ELF     .size           EXTERN_ASM\name, . - EXTERN_ASM\name
.else
ELF     .size           \name, . - \name
.endif
FUNC    .endfunc
        .purgem         endfunc
    .endm
        .text
        .align          2
    .if \export == 1
        .global         EXTERN_ASM\name
ELF     .hidden         EXTERN_ASM\name
ELF     .type           EXTERN_ASM\name, %function
FUNC    .func           EXTERN_ASM\name
EXTERN_ASM\name:
    .else
ELF     .hidden         \name
ELF     .type           \name, %function
FUNC    .func           \name
\name:
    .endif
.endm
|
||||
|
||||
/*
 * const name, align=2, relocate=0
 *
 * Opens a read-only data object labelled <name>, aligned with .align <align>.
 * Section choice:
 *   - .data.rel.ro when relocate is non-zero and HAVE_SECTION_DATA_REL_RO is set
 *   - otherwise .rodata (non-Mach-O) or .const_data (Mach-O)
 * A one-shot endconst macro is defined alongside; it emits the ELF .size of
 * the object and purges itself.
 */
.macro const name, align=2, relocate=0
    .macro endconst
ELF     .size           \name, . - \name
        .purgem         endconst
    .endm
.if HAVE_SECTION_DATA_REL_RO && \relocate
        .section        .data.rel.ro
.else
NONMACH .section        .rodata
MACH    .const_data
.endif
        .align          \align
\name:
.endm
|
||||
|
||||
/*
 * movrel rd, val
 *
 * Loads the address of symbol val into rd.
 *   PIC          : an inline literal holds val - (2f + 8); adding pc at label 2
 *                  rebuilds the absolute address (pc reads as instruction + 8
 *                  in ARM state).
 *   HAVE_ARMV6T2 : movw/movt pair.
 *   otherwise    : literal-pool load.
 */
.macro movrel rd, val
#if defined(PIC)
        ldr             \rd, 1f
        b               2f
1:
        // FIXME: thumb (the +8 pc bias below is the ARM-state value)
        .word           \val - (2f + 8)
2:
        add             \rd, \rd, pc
#elif HAVE_ARMV6T2
        movw            \rd, #:lower16:\val
        movt            \rd, #:upper16:\val
#else
        ldr             \rd, =\val
#endif
.endm
|
||||
|
||||
/*
 * movrelx rd, val, got
 *
 * Loads the address of symbol val into rd through an indirection.
 *   PIC + ELF   : got receives the address of _GLOBAL_OFFSET_TABLE_ (computed
 *                 pc-relative), rd the GOT offset of val, then rd = [got + rd].
 *                 Clobbers got.
 *   PIC + Apple : emits a non-lazy symbol pointer for val and loads it
 *                 pc-relative; the got argument is not used.
 *   otherwise   : plain movrel.
 */
.macro movrelx rd, val, got
#if defined(PIC) && defined(__ELF__)
        ldr             \got, 2f
        ldr             \rd, 1f
        b               3f
1:
        // FIXME: thumb
        .word           \val(GOT)
2:
        .word           _GLOBAL_OFFSET_TABLE_ - (3f + 8)
3:
        add             \got, \got, pc
        ldr             \rd, [\got, \rd]
#elif defined(PIC) && defined(__APPLE__)
        ldr             \rd, 1f
        b               2f
1:
        // FIXME: thumb
        .word           3f - (2f + 8)
2:
        ldr             \rd, [pc, \rd]
        .non_lazy_symbol_pointer
3:
        .indirect_symbol \val
        .word           0
        .text
#else
        movrel          \rd, \val
#endif
.endm
|
||||
|
||||
/*
 * movconst rd, val
 *
 * Loads the constant val into rd. With HAVE_ARMV6T2 a movw is emitted, and a
 * movt only when val has bits set above bit 15; otherwise a literal-pool load
 * is used.
 */
.macro movconst rd, val
#if HAVE_ARMV6T2
        movw            \rd, #:lower16:\val
.if \val >> 16
        movt            \rd, #:upper16:\val
.endif
#else
        ldr             \rd, =\val
#endif
.endm
|
||||
|
||||
/*
 * Row strides in bytes. In the routines visible in these sources FENC_STRIDE
 * is the post-increment applied to the r1 pixel pointer and FDEC_STRIDE the
 * one applied to the r2 (or r0) pixel pointer.
 * NOTE(review): presumably these must match the C-side constants of the same
 * name; confirm against the common headers.
 */
#define FENC_STRIDE     16
#define FDEC_STRIDE     32
|
||||
|
||||
/*
 * HORIZ_ADD dest, a [, b]
 *
 * Horizontal sum of unsigned 16-bit lanes. If b is given it is first added
 * into a lane-wise. a is then pairwise-widened in place (u16 -> u32), and a
 * second pairwise widening (u32 -> u64) is written to dest. Clobbers a.
 */
.macro HORIZ_ADD dest, a, b
.ifnb \b
        vadd.u16        \a, \a, \b
.endif
        vpaddl.u16      \a, \a
        vpaddl.u32      \dest, \a
.endm
|
||||
|
||||
/*
 * SUMSUB_AB sum, diff, a, b
 *
 * 16-bit butterfly: sum = a + b, diff = a - b.
 * sum is written first, so it must not alias a or b unless the caller wants
 * diff computed from the updated register.
 */
.macro SUMSUB_AB sum, diff, a, b
        vadd.s16        \sum, \a, \b
        vsub.s16        \diff, \a, \b
.endm
|
||||
|
||||
/*
 * SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
 *
 * Two butterflies back to back: (s1, d1) = a +/- b, then (s2, d2) = c +/- d.
 */
.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
        SUMSUB_AB       \s1, \d1, \a, \b
        SUMSUB_AB       \s2, \d2, \c, \d
.endm
|
||||
|
||||
/* ABS2 a b: in-place absolute value of the signed 16-bit lanes of a and of b. */
.macro ABS2 a b
        vabs.s16        \a, \a
        vabs.s16        \b, \b
.endm
|
||||
|
||||
/*
 * HADAMARD dist, op, d1, d2, s1, s2
 *
 *   dist   distance in elements (0 for vertical pass, 1/2 for horizontal
 *          passes); 1 interleaves with vtrn.16, any other value with vtrn.32
 *   op     sumsub -> d1 = s1 + s2, d2 = s1 - s2
 *          anything else (amax) -> d1 = max(|s1|, |s2|), d2 is left untouched
 *   d1/d2  destination registers
 *   s1/s2  source registers; modified in place by the vtrn, and by vabs in
 *          the amax case
 */
.macro HADAMARD dist, op, d1, d2, s1, s2
.if \dist == 1
        vtrn.16         \s1, \s2
.else
        vtrn.32         \s1, \s2
.endif
.ifc \op, sumsub
        SUMSUB_AB       \d1, \d2, \s1, \s2
.else
        vabs.s16        \s1, \s1
        vabs.s16        \s2, \s2
        vmax.s16        \d1, \s1, \s2
.endif
.endm
|
||||
|
||||
/*
 * TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
 *
 * In-place transpose of an 8x8 byte matrix held one row per register,
 * done as three vtrn stages at 32-, 16- and 8-bit granularity.
 */
.macro TRANSPOSE8x8 r0 r1 r2 r3 r4 r5 r6 r7
        // stage 1: swap 32-bit groups between rows i and i+4
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        // stage 2: swap 16-bit groups between rows i and i+2
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        // stage 3: swap bytes between neighbouring rows
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
.endm
|
||||
|
||||
/*
 * TRANSPOSE4x4 r0 r1 r2 r3
 *
 * In-place transpose of 4x4 byte tiles across four registers: 16-bit groups
 * are exchanged between rows i and i+2, then bytes between neighbouring rows.
 */
.macro TRANSPOSE4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
.endm
|
||||
|
||||
/*
 * TRANSPOSE4x4_16 d0 d1 d2 d3
 *
 * In-place transpose of 4x4 tiles of 16-bit elements across four registers:
 * 32-bit pairs are exchanged between rows i and i+2, then 16-bit elements
 * between neighbouring rows.
 */
.macro TRANSPOSE4x4_16 d0 d1 d2 d3
        vtrn.32         \d0, \d2
        vtrn.32         \d1, \d3
        vtrn.16         \d0, \d1
        vtrn.16         \d2, \d3
.endm
|
||||
84
common/arm/bitstream-a.S
Normal file
84
common/arm/bitstream-a.S
Normal file
@@ -0,0 +1,84 @@
|
||||
/*****************************************************************************
|
||||
* bitstream-a.S: arm bitstream functions
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2014-2025 x264 project
|
||||
*
|
||||
* Authors: Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
// nal_escape_neon: copy bytes from src to dst, inserting a 0x03 byte before
// any byte <= 3 that follows two zero bytes.
//
// In:   r0 = dst, r1 = src, r2 = end of src
// Out:  r0 = dst advanced past the last byte written
// Regs: r3 = 3 (escape byte), r4 = current byte, r5 = history of recent bytes,
//       ip = scratch, lr = negative byte count / scratch,
//       q0 = previous 16-byte chunk (initialised to 0xff), q1 = current chunk,
//       q8 = 4 in every byte, q2/q3/q9-q11 = scratch
function nal_escape_neon
        push            {r4-r5, lr}
        vmov.u8         q0, #0xff               // no zero bytes precede the start
        vmov.u8         q8, #4
        mov             r3, #3
        subs            lr, r1, r2              // lr = src - end
        beq             99f                     // nothing to do
0:
        cmn             lr, #15
        blt             16f                     // at least 16 bytes left: vector path
        mov             r1, r2                  // short tail: address it as [end + lr]
        b               100f
16:
        vld1.8          {q1}, [r1]!
        vext.8          q2, q0, q1, #14         // for each position, the byte two back
        vext.8          q3, q0, q1, #15         // for each position, the byte one back
        vcgt.u8         q11, q8, q1             // current byte < 4
        vceq.u8         q9, q2, #0
        vceq.u8         q10, q3, #0
        vand            q9, q9, q11
        vand            q9, q9, q10             // mask of positions needing an escape
        vshrn.u16       d22, q9, #4             // squeeze the mask into 64 bits
        vmov            ip, lr, d22
        orrs            ip, ip, lr
        beq             16f                     // clean chunk: store it as is
        mov             lr, #-16                // redo this chunk byte by byte
100:
        // rebuild the two-byte history from the last two bytes of q0
        vmov.u8         r5, d1[6]
        vmov.u8         r4, d1[7]
        orr             r5, r4, r5, lsl #8
101:
        ldrb            r4, [r1, lr]
        orr             ip, r4, r5, lsl #16     // previous two bytes above the current one
        cmp             ip, #3
        bhi             102f
        strb            r3, [r0], #1            // emit the escape byte
        orr             r5, r3, r5, lsl #8
102:
        adds            lr, lr, #1              // flags from here drive the blt below
        strb            r4, [r0], #1
        orr             r5, r4, r5, lsl #8
        blt             101b
        subs            lr, r1, r2
        lsr             ip, r5, #8
        vmov.u8         d1[6], ip               // write the history back into q0
        vmov.u8         d1[7], r5
        blt             0b

        pop             {r4-r5, pc}
16:
        subs            lr, r1, r2
        vst1.8          {q1}, [r0]!
        vmov            q0, q1                  // current chunk becomes the history
        blt             0b
99:
        pop             {r4-r5, pc}
endfunc
|
||||
32
common/arm/bitstream.h
Normal file
32
common/arm/bitstream.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*****************************************************************************
|
||||
* bitstream.h: arm bitstream functions
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_BITSTREAM_H
|
||||
#define X264_ARM_BITSTREAM_H
|
||||
|
||||
#define x264_nal_escape_neon x264_template(nal_escape_neon)
|
||||
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||
|
||||
#endif
|
||||
108
common/arm/cpu-a.S
Normal file
108
common/arm/cpu-a.S
Normal file
@@ -0,0 +1,108 @@
|
||||
/*****************************************************************************
|
||||
* cpu-a.S: arm cpu detection
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
.align 2
|
||||
|
||||
// Executes a single NEON instruction and returns.
// done in gas because .fpu neon overrides the refusal to assemble
// instructions the selected -march/-mcpu doesn't support
function cpu_neon_test
        vadd.i16        q0, q0, q0
        bx              lr
endfunc
|
||||
|
||||
// Turns on the cycle counter through the cp15 performance monitor registers.
// return: 0 on success
//         1 if counters were already enabled
//         9 if lo-res counters were already enabled
// Clobbers r2 and the flags.
function cpu_enable_armv7_counter, export=0
        mrc             p15, 0, r2, c9, c12, 0  // read PMNC
        ands            r0, r2, #1              // r0 = 0 when counters were off
        andne           r0, r2, #9              // else report enable + resolution bits

        orr             r2, r2, #1              // enable counters
        bic             r2, r2, #8              // full resolution
        mcreq           p15, 0, r2, c9, c12, 0  // write PMNC (only if we enabled them)
        mov             r2, #1 << 31            // enable cycle counter
        mcr             p15, 0, r2, c9, c12, 1  // write CNTENS
        bx              lr
endfunc
|
||||
|
||||
// Clears the enable bit (bit 0) of PMNC. Uses r0 as scratch.
function cpu_disable_armv7_counter, export=0
        mrc             p15, 0, r0, c9, c12, 0  // read PMNC
        bic             r0, r0, #1              // disable counters
        mcr             p15, 0, r0, c9, c12, 0  // write PMNC
        bx              lr
endfunc
|
||||
|
||||
|
||||
// READ_TIME r: read cp15 c9, c13, 0 into r (used below as the timestamp source).
.macro READ_TIME r
        mrc             p15, 0, \r, c9, c13, 0
.endm
|
||||
|
||||
// return: 0 if neon -> arm transfers take more than 10 cycles
|
||||
// nonzero otherwise
|
||||
// Times NEON -> ARM register transfers with the cycle counter.
// Out:  r0 = 0 when user-mode counter access is unavailable or the scaled
//            measurement exceeds 10, otherwise the scaled measurement
// Regs: r1/r2 = start/end timestamps, r3 = accumulated time,
//       r4 = inner loop counter, r5 = inner loop length (1, or 64 when the
//       counter was found in lo-res mode), r6 and ip = sample counters
function cpu_fast_neon_mrc_test
        // check for user access to performance counters
        mrc             p15, 0, r0, c9, c14, 0
        cmp             r0, #0
        bxeq            lr                      // no access: return 0

        push            {r4-r6, lr}
        bl              cpu_enable_armv7_counter
        ands            r1, r0, #8              // lo-res bit of the previous state
        mov             r3, #0
        mov             ip, #4
        mov             r6, #4
        moveq           r5, #1
        movne           r5, #64

average_loop:
        mov             r4, r5
        READ_TIME       r1
1:      subs            r4, r4, #1
.rept 8
        vmov.u32        lr, d0[0]               // the transfer being timed
        add             lr, lr, lr              // consume the result
.endr
        bgt             1b
        READ_TIME       r2

        subs            r6, r6, #1
        sub             r2, r2, r1              // elapsed time for this sample
        cmpgt           r2, #30 << 3            // assume context switch if it took over 30 cycles
        addle           r3, r3, r2
        subsle          ip, ip, #1
        bgt             average_loop

        // disable counters if we enabled them
        ands            r0, r0, #1
        bleq            cpu_disable_armv7_counter

        lsr             r0, r3, #5              // scale the sum down by 32
        cmp             r0, #10
        movgt           r0, #0
        pop             {r4-r6, pc}
endfunc
|
||||
764
common/arm/dct-a.S
Normal file
764
common/arm/dct-a.S
Normal file
@@ -0,0 +1,764 @@
|
||||
/****************************************************************************
|
||||
* dct-a.S: arm transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Martin Storsjo <martin@martin.st>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
// 32-byte table of byte-index pairs, aligned with .align 4.
// NOTE(review): presumably a vtbl shuffle pattern for a 4x4 frame zigzag of
// 16-bit coefficients; its user is not visible in this part of the file.
const scan4x4_frame, align=4
        .byte            0,  1,    8,  9,    2,  3,    4,  5
        .byte            2,  3,    8,  9,   16, 17,   10, 11
        .byte           12, 13,    6,  7,   14, 15,   20, 21
        .byte           10, 11,   12, 13,    6,  7,   14, 15
endconst
|
||||
|
||||
.text
|
||||
|
||||
// SUMSUB_SHR shift sum sub a b t0 t1
//   sum = a + (b >> shift)      sub = (a >> shift) - b
// 16-bit lanes, arithmetic shift. t0 and t1 are scratch and are overwritten
// before a and b are last read, so they must not alias a or b.
.macro SUMSUB_SHR shift sum sub a b t0 t1
        vshr.s16        \t0, \b, #\shift
        vshr.s16        \t1, \a, #\shift
        vadd.s16        \sum, \a, \t0
        vsub.s16        \sub, \t1, \b
.endm
|
||||
|
||||
// SUMSUB_SHR2 shift sum sub a b t0 t1
//   sum = (a >> shift) + b      sub = a - (b >> shift)
// 16-bit lanes, arithmetic shift. t0 and t1 are scratch.
.macro SUMSUB_SHR2 shift sum sub a b t0 t1
        vshr.s16        \t0, \a, #\shift
        vshr.s16        \t1, \b, #\shift
        vadd.s16        \sum, \t0, \b
        vsub.s16        \sub, \a, \t1
.endm
|
||||
|
||||
// SUMSUB_15 a b ma mb t0 t1
//   a += 1.5*ma      b -= 1.5*mb
// The factor 1.5*x is formed as x + (x >> 1) in 16-bit lanes.
// t0 and t1 are scratch.
.macro SUMSUB_15 a b ma mb t0 t1
        vshr.s16        \t0, \ma, #1
        vshr.s16        \t1, \mb, #1
        vadd.s16        \t0, \t0, \ma
        vadd.s16        \t1, \t1, \mb
        vadd.s16        \a, \a, \t0
        vsub.s16        \b, \b, \t1
.endm
|
||||
|
||||
|
||||
// dct4x4dc_neon: in-place 4x4 butterfly transform of 16-bit values with a
// rounding halve in the final stage.
// In:   r0 = 16 x int16, accessed with a 128-bit alignment hint
// Uses: d0-d7, d16, d17, d31
function dct4x4dc_neon
        vld1.64         {d0-d3}, [r0, :128]
        SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
        SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7

        vmov.s16        d31, #1                 // rounding term for the halving subtracts
        HADAMARD        1, sumsub, q2, q3, q0, q1
        vtrn.32         d4, d5
        vadd.s16        d16, d4, d31
        vtrn.32         d6, d7
        vadd.s16        d17, d6, d31
        vrhadd.s16      d0, d4, d5              // (a + b + 1) >> 1
        vhsub.s16       d1, d16, d5             // (a + 1 - b) >> 1
        vhsub.s16       d2, d17, d7
        vrhadd.s16      d3, d6, d7
        vst1.64         {d0-d3}, [r0, :128]
        bx              lr
endfunc
|
||||
|
||||
// idct4x4dc_neon: in-place 4x4 butterfly transform of 16-bit values, with no
// scaling or rounding.
// In:   r0 = 16 x int16, accessed with a 128-bit alignment hint
// Uses: d0-d7
function idct4x4dc_neon
        vld1.64         {d0-d3}, [r0, :128]
        SUMSUB_ABCD     d4, d5, d6, d7, d0, d1, d2, d3
        SUMSUB_ABCD     d0, d2, d3, d1, d4, d6, d5, d7

        HADAMARD        1, sumsub, q2, q3, q0, q1
        HADAMARD        2, sumsub, d0, d1, d4, d5
        HADAMARD        2, sumsub, d3, d2, d6, d7
        vst1.64         {d0-d3}, [r0, :128]
        bx              lr
endfunc
|
||||
|
||||
|
||||
// DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
// One 4-point transform pass over 16-bit lanes.
//   inputs : d4..d7 (all four are overwritten)
//   outputs: d0 = s03 + s12, d1 = 2*d03 + d12, d2 = s03 - s12, d3 = d03 - 2*d12
// where s/d denote sum/difference of the input pairs (4,7) and (5,6).
.macro DCT_1D d0 d1 d2 d3 d4 d5 d6 d7
        SUMSUB_AB       \d1, \d6, \d5, \d6      // s12 / d12
        SUMSUB_AB       \d3, \d7, \d4, \d7      // s03 / d03
        vadd.s16        \d0, \d3, \d1
        vadd.s16        \d4, \d7, \d7           // 2 * d03
        vadd.s16        \d5, \d6, \d6           // 2 * d12
        vsub.s16        \d2, \d3, \d1
        vadd.s16        \d1, \d4, \d6
        vsub.s16        \d3, \d7, \d5
.endm
|
||||
|
||||
// sub4x4_dct_neon: 4x4 transform of the difference between two pixel blocks.
// In:   r0 = output, 16 x int16 (128-bit alignment hint)
//       r1 = first pixel block, rows FENC_STRIDE apart
//       r2 = second pixel block, rows FDEC_STRIDE apart
// Uses: r3, ip, d0-d7, q8-q11. r1 and r2 are advanced by four rows.
function sub4x4_dct_neon
        mov             r3, #FENC_STRIDE
        mov             ip, #FDEC_STRIDE
        // each load fetches 4 pixels; only the low half of each widened
        // difference is consumed below
        vld1.32         {d0[]}, [r1, :32], r3
        vld1.32         {d1[]}, [r2, :32], ip
        vld1.32         {d2[]}, [r1, :32], r3
        vsubl.u8        q8, d0, d1
        vld1.32         {d3[]}, [r2, :32], ip
        vld1.32         {d4[]}, [r1, :32], r3
        vsubl.u8        q9, d2, d3
        vld1.32         {d5[]}, [r2, :32], ip
        vld1.32         {d6[]}, [r1, :32], r3
        vsubl.u8        q10, d4, d5
        vld1.32         {d7[]}, [r2, :32], ip
        vsubl.u8        q11, d6, d7

        DCT_1D          d0, d1, d2, d3, d16, d18, d20, d22
        TRANSPOSE4x4_16 d0, d1, d2, d3
        DCT_1D          d4, d5, d6, d7, d0, d1, d2, d3
        vst1.64         {d4-d7}, [r0, :128]
        bx              lr
endfunc
|
||||
|
||||
// sub8x4_dct_neon (file-local helper): transforms the difference of an
// 8-wide, 4-row strip as two 4x4 blocks.
// In:   r0 = output (64 bytes written, pointer advanced)
//       r1 / r2 = pixel rows, stepped by r3 / ip
//       r3 / ip = row strides, set up by the caller
// Uses: q0-q3, q8-q13. r1 and r2 are advanced by four rows.
function sub8x4_dct_neon, export=0
        vld1.64         {d0}, [r1, :64], r3
        vld1.64         {d1}, [r2, :64], ip
        vsubl.u8        q8, d0, d1
        vld1.64         {d2}, [r1, :64], r3
        vld1.64         {d3}, [r2, :64], ip
        vsubl.u8        q9, d2, d3
        vld1.64         {d4}, [r1, :64], r3
        vld1.64         {d5}, [r2, :64], ip
        vsubl.u8        q10, d4, d5
        vld1.64         {d6}, [r1, :64], r3
        vld1.64         {d7}, [r2, :64], ip
        vsubl.u8        q11, d6, d7

        DCT_1D          q0, q1, q2, q3, q8, q9, q10, q11
        TRANSPOSE4x4_16 q0, q1, q2, q3

        // second pass, written out so the stores can be interleaved
        SUMSUB_AB       q8, q12, q0, q3
        SUMSUB_AB       q9, q10, q1, q2
        vadd.i16        q13, q12, q12
        vadd.i16        q11, q10, q10
        vadd.i16        d0, d16, d18
        vadd.i16        d1, d26, d20
        vsub.i16        d2, d16, d18
        vsub.i16        d3, d24, d22
        vst1.64         {d0-d1}, [r0, :128]!
        vadd.i16        d4, d17, d19
        vadd.i16        d5, d27, d21
        vst1.64         {d2-d3}, [r0, :128]!
        vsub.i16        d6, d17, d19
        vsub.i16        d7, d25, d23
        vst1.64         {d4-d5}, [r0, :128]!
        vst1.64         {d6-d7}, [r0, :128]!
        bx              lr
endfunc
|
||||
|
||||
// sub8x8_dct_neon: 8x8 area processed as two 8x4 strips.
// In:   r0 = output, r1 / r2 = pixel blocks (FENC_STRIDE / FDEC_STRIDE rows)
// The second strip is a tail call, so the helper returns straight to our caller.
function sub8x8_dct_neon
        push            {lr}
        mov             r3, #FENC_STRIDE
        mov             ip, #FDEC_STRIDE
        bl              sub8x4_dct_neon
        pop             {lr}
        b               sub8x4_dct_neon
endfunc
|
||||
|
||||
// sub16x16_dct_neon: 16x16 area processed as four 8x8 quadrants, each made
// of two 8x4 strips, in the order top-left, top-right, bottom-left,
// bottom-right.
// In:   r0 = output, r1 / r2 = pixel blocks (FENC_STRIDE / FDEC_STRIDE rows)
function sub16x16_dct_neon
        push            {lr}
        mov             r3, #FENC_STRIDE
        mov             ip, #FDEC_STRIDE
        bl              sub8x4_dct_neon
        bl              sub8x4_dct_neon
        // back up 8 rows and step 8 pixels right
        sub             r1, r1, #8*FENC_STRIDE-8
        sub             r2, r2, #8*FDEC_STRIDE-8
        bl              sub8x4_dct_neon
        bl              sub8x4_dct_neon
        // already 8 rows down: step 8 pixels left
        sub             r1, r1, #8
        sub             r2, r2, #8
        bl              sub8x4_dct_neon
        bl              sub8x4_dct_neon
        // back up 8 rows and step 8 pixels right
        sub             r1, r1, #8*FENC_STRIDE-8
        sub             r2, r2, #8*FDEC_STRIDE-8
        bl              sub8x4_dct_neon
        pop             {lr}
        b               sub8x4_dct_neon         // tail call for the last strip
endfunc
|
||||
|
||||
|
||||
// DCT8_1D type
// One 8-point transform pass over 16-bit lanes.
//   inputs : q8..q15 (rows 0..7)
//   outputs: q8..q15
//   scratch: q0-q3
// The type argument is accepted but not referenced in the body.
.macro DCT8_1D type
        SUMSUB_AB       q2, q1, q11, q12        // s34/d34
        SUMSUB_AB       q3, q11, q10, q13       // s25/d25
        SUMSUB_AB       q13, q10, q9, q14       // s16/d16
        SUMSUB_AB       q14, q8, q8, q15        // s07/d07

        SUMSUB_AB       q9, q2, q14, q2         // a0/a2
        SUMSUB_AB       q12, q14, q13, q3       // a1/a3

        SUMSUB_AB       q3, q13, q8, q1         // a6/a5
        vshr.s16        q0, q10, #1
        vshr.s16        q15, q11, #1
        vadd.s16        q0, q0, q10
        vadd.s16        q15, q15, q11
        vsub.s16        q3, q3, q0
        vsub.s16        q13, q13, q15

        SUMSUB_AB       q0, q15, q10, q11       // a4/a7
        vshr.s16        q10, q8, #1
        vshr.s16        q11, q1, #1
        vadd.s16        q10, q10, q8
        vadd.s16        q11, q11, q1
        vadd.s16        q10, q0, q10
        vadd.s16        q15, q15, q11

        SUMSUB_AB       q8, q12, q9, q12
        SUMSUB_SHR      2, q9, q15, q10, q15, q0, q1
        SUMSUB_SHR      1, q10, q14, q2, q14, q0, q1
        SUMSUB_SHR2     2, q11, q13, q3, q13, q0, q1
.endm
|
||||
|
||||
// sub8x8_dct8_neon: 8x8 transform of the difference between two pixel blocks.
// In:   r0 = output, 64 x int16 (128 bytes written, pointer advanced)
//       r1 = first pixel block, rows FENC_STRIDE apart
//       r2 = second pixel block, rows FDEC_STRIDE apart
// Uses: r3, ip, q0-q3, q8-q15. r1 and r2 are advanced by eight rows.
function sub8x8_dct8_neon
        mov             r3, #FENC_STRIDE
        mov             ip, #FDEC_STRIDE
        // widened row differences land in q8..q15
        vld1.64         {d16}, [r1, :64], r3
        vld1.64         {d17}, [r2, :64], ip
        vsubl.u8        q8, d16, d17
        vld1.64         {d18}, [r1, :64], r3
        vld1.64         {d19}, [r2, :64], ip
        vsubl.u8        q9, d18, d19
        vld1.64         {d20}, [r1, :64], r3
        vld1.64         {d21}, [r2, :64], ip
        vsubl.u8        q10, d20, d21
        vld1.64         {d22}, [r1, :64], r3
        vld1.64         {d23}, [r2, :64], ip
        vsubl.u8        q11, d22, d23
        vld1.64         {d24}, [r1, :64], r3
        vld1.64         {d25}, [r2, :64], ip
        vsubl.u8        q12, d24, d25
        vld1.64         {d26}, [r1, :64], r3
        vld1.64         {d27}, [r2, :64], ip
        vsubl.u8        q13, d26, d27
        vld1.64         {d28}, [r1, :64], r3
        vld1.64         {d29}, [r2, :64], ip
        vsubl.u8        q14, d28, d29
        vld1.64         {d30}, [r1, :64], r3
        vld1.64         {d31}, [r2, :64], ip
        vsubl.u8        q15, d30, d31

        DCT8_1D         row
        // 8x8 transpose of 16-bit elements
        vswp            d17, d24                // 8, 12
        vswp            d21, d28                // 10,14
        vtrn.32         q8, q10
        vtrn.32         q12, q14

        vswp            d19, d26                // 9, 13
        vswp            d23, d30                // 11,15
        vtrn.32         q9, q11
        vtrn.32         q13, q15

        vtrn.16         q10, q11
        vtrn.16         q12, q13
        vtrn.16         q8, q9
        vtrn.16         q14, q15
        DCT8_1D         col

        vst1.64         {d16-d19}, [r0, :128]!
        vst1.64         {d20-d23}, [r0, :128]!
        vst1.64         {d24-d27}, [r0, :128]!
        vst1.64         {d28-d31}, [r0, :128]!
        bx              lr
endfunc
|
||||
|
||||
// sub16x16_dct8_neon: 16x16 area processed as four 8x8 blocks in the order
// top-left, top-right, bottom-left, bottom-right.
// In:   r0 = output, r1 / r2 = pixel blocks (FENC_STRIDE / FDEC_STRIDE rows)
// Each 8x8 call leaves r1 and r2 eight rows further down; the subtractions
// reposition them for the next block.
function sub16x16_dct8_neon
        push            {lr}
        bl              X(sub8x8_dct8_neon)
        sub             r1, r1, #FENC_STRIDE*8 - 8
        sub             r2, r2, #FDEC_STRIDE*8 - 8
        bl              X(sub8x8_dct8_neon)
        sub             r1, r1, #8
        sub             r2, r2, #8
        bl              X(sub8x8_dct8_neon)
        pop             {lr}
        sub             r1, r1, #FENC_STRIDE*8 - 8
        sub             r2, r2, #FDEC_STRIDE*8 - 8
        b               X(sub8x8_dct8_neon)     // tail call for the last block
endfunc
|
||||
|
||||
|
||||
// First part of IDCT (minus the final SUMSUB_AB, which the caller issues)
|
||||
// IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
//   inputs : d0..d3 (left unmodified)
//   outputs: d4 = d0 + d2, d5 = d0 - d2,
//            d6 = (d3 >> 1) + d1, d7 = (d1 >> 1) - d3
.macro IDCT_1D d4 d5 d6 d7 d0 d1 d2 d3
        SUMSUB_AB       \d4, \d5, \d0, \d2
        vshr.s16        \d7, \d1, #1
        vshr.s16        \d6, \d3, #1
        vsub.s16        \d7, \d7, \d3
        vadd.s16        \d6, \d6, \d1
.endm
|
||||
|
||||
// add4x4_idct_neon: inverse 4x4 transform, result added to the pixels in place.
// In:   r0 = 4x4 pixel block, rows FDEC_STRIDE apart (read and written)
//       r1 = 16 x int16 coefficients (128-bit alignment hint)
// Uses: r2, q0-q3, d30, d31. r0 ends four rows below where it started.
// The pixel loads are interleaved with the arithmetic.
function add4x4_idct_neon
        mov             r2, #FDEC_STRIDE
        vld1.64         {d0-d3}, [r1, :128]

        IDCT_1D         d4, d5, d6, d7, d0, d1, d2, d3
        vld1.32         {d30[0]}, [r0, :32], r2
        SUMSUB_AB       q0, q1, q2, q3

        TRANSPOSE4x4_16 d0, d1, d3, d2

        IDCT_1D         d4, d5, d6, d7, d0, d1, d3, d2
        vld1.32         {d30[1]}, [r0, :32], r2
        SUMSUB_AB       q0, q1, q2, q3

        vrshr.s16       q0, q0, #6              // rounding shift
        vld1.32         {d31[1]}, [r0, :32], r2
        vrshr.s16       q1, q1, #6
        vld1.32         {d31[0]}, [r0, :32], r2

        sub             r0, r0, r2, lsl #2      // rewind the four rows just read
        vaddw.u8        q0, q0, d30
        vaddw.u8        q1, q1, d31
        vqmovun.s16     d0, q0                  // saturate back to 8 bits
        vqmovun.s16     d2, q1

        // rows 2 and 3 sit in swapped lanes, matching the loads above
        vst1.32         {d0[0]}, [r0, :32], r2
        vst1.32         {d0[1]}, [r0, :32], r2
        vst1.32         {d2[1]}, [r0, :32], r2
        vst1.32         {d2[0]}, [r0, :32], r2
        bx              lr
endfunc
|
||||
|
||||
// add8x4_idct_neon (file-local helper): inverse transform of two 4x4
// coefficient blocks, added to an 8-wide, 4-row pixel strip in place.
// In:   r0 = pixel strip (read and written; ends four rows further down)
//       r1 = coefficients (64 bytes read, pointer advanced)
//       r2 = pixel row stride, set up by the caller
// Uses: q0-q3, q8-q11, q14, q15. Leaves ip and lr untouched.
function add8x4_idct_neon, export=0
        vld1.64         {d0-d3}, [r1, :128]!
        IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3
        vld1.64         {d4-d7}, [r1, :128]!
        IDCT_1D         d17, d19, d21, d23, d4, d5, d6, d7
        SUMSUB_AB       q0, q3, q8, q10
        SUMSUB_AB       q1, q2, q9, q11

        TRANSPOSE4x4_16 q0, q1, q2, q3

        IDCT_1D         q8, q9, q10, q11, q0, q1, q2, q3
        SUMSUB_AB       q0, q3, q8, q10
        SUMSUB_AB       q1, q2, q9, q11

        // rounding shifts interleaved with the pixel loads
        vrshr.s16       q0, q0, #6
        vld1.32         {d28}, [r0, :64], r2
        vrshr.s16       q1, q1, #6
        vld1.32         {d29}, [r0, :64], r2
        vrshr.s16       q2, q2, #6
        vld1.32         {d30}, [r0, :64], r2
        vrshr.s16       q3, q3, #6
        vld1.32         {d31}, [r0, :64], r2

        sub             r0, r0, r2, lsl #2      // rewind the four rows just read
        vaddw.u8        q0, q0, d28
        vaddw.u8        q1, q1, d29
        vaddw.u8        q2, q2, d30
        vaddw.u8        q3, q3, d31

        // saturate to 8 bits and store
        vqmovun.s16     d0, q0
        vqmovun.s16     d1, q1
        vst1.32         {d0}, [r0, :64], r2
        vqmovun.s16     d2, q2
        vst1.32         {d1}, [r0, :64], r2
        vqmovun.s16     d3, q3
        vst1.32         {d2}, [r0, :64], r2
        vst1.32         {d3}, [r0, :64], r2
        bx              lr
endfunc
|
||||
|
||||
// add8x8_idct_neon: 8x8 pixel area processed as two 8x4 strips.
// In:   r0 = pixels (FDEC_STRIDE rows), r1 = coefficients
// The return address is parked in ip, which the helper leaves alone.
function add8x8_idct_neon
        mov             r2, #FDEC_STRIDE
        mov             ip, lr
        bl              add8x4_idct_neon
        mov             lr, ip
        b               add8x4_idct_neon        // tail call for the second strip
endfunc
|
||||
|
||||
// add16x16_idct_neon: 16x16 pixel area processed as four 8x8 quadrants (two
// 8x4 strips each) in the order top-left, top-right, bottom-left,
// bottom-right. The coefficient pointer r1 simply keeps advancing.
// In:   r0 = pixels (FDEC_STRIDE rows), r1 = coefficients
// The return address is parked in ip, which the helper leaves alone.
function add16x16_idct_neon
        mov             r2, #FDEC_STRIDE
        mov             ip, lr
        bl              add8x4_idct_neon
        bl              add8x4_idct_neon
        sub             r0, r0, #8*FDEC_STRIDE-8    // up 8 rows, right 8 pixels
        bl              add8x4_idct_neon
        bl              add8x4_idct_neon
        sub             r0, r0, #8                  // left 8 pixels, 8 rows down
        bl              add8x4_idct_neon
        bl              add8x4_idct_neon
        sub             r0, r0, #8*FDEC_STRIDE-8    // up 8 rows, right 8 pixels
        bl              add8x4_idct_neon
        mov             lr, ip
        b               add8x4_idct_neon            // tail call for the last strip
endfunc
|
||||
|
||||
|
||||
// IDCT8_1D type
// One 8-point inverse transform pass over 16-bit lanes in q8..q15.
//   type = row : q14/q15 are loaded from [r1] (post-incremented) part-way
//                through, and a vtrn.16 of q8/q9 is folded into the tail
//   type = col : three vswp instructions are interleaved with the arithmetic
//   scratch    : q0-q3
.macro IDCT8_1D type
.ifc \type, col
        vswp            d21, d28
.endif
        SUMSUB_AB       q0, q1, q8, q12                     // a0/a2
.ifc \type, row
        vld1.64         {d28-d31}, [r1, :128]!
.else
        vswp            d19, d26
.endif
        SUMSUB_SHR      1, q2, q3, q10, q14, q8, q12        // a6/a4
.ifc \type, col
        vswp            d23, d30
.endif
        SUMSUB_AB       q8, q10, q13, q11
        SUMSUB_15       q8, q10, q9, q15, q12, q14          // a7/a1
        SUMSUB_AB       q14, q15, q15, q9
        SUMSUB_15       q15, q14, q13, q11, q12, q9         // a5/a3

        SUMSUB_SHR      2, q13, q14, q14, q15, q11, q9      // b3/b5
        SUMSUB_SHR2     2, q12, q15, q8, q10, q11, q9       // b1/b7

        SUMSUB_AB       q10, q2, q0, q2                     // b0/b6
        SUMSUB_AB       q11, q3, q1, q3                     // b2/b4

        SUMSUB_AB       q8, q15, q10, q15
        SUMSUB_AB       q9, q14, q11, q14
        SUMSUB_AB       q10, q13, q3, q13
.ifc \type, row
        vtrn.16         q8, q9
.endif
        SUMSUB_AB       q11, q12, q2, q12
.endm
|
||||
|
||||
// add8x8_idct8_neon: inverse 8x8 transform, result added to the pixels in place.
// In:   r0 = 8x8 pixel block, rows FDEC_STRIDE apart (read and written;
//            ends eight rows further down)
//       r1 = 64 x int16 coefficients (128 bytes read, pointer advanced)
// Uses: r2, q0-q3, q8-q15. Leaves ip and lr untouched.
function add8x8_idct8_neon
        mov             r2, #FDEC_STRIDE
        // rows 0-5 here; the row pass loads rows 6-7 itself
        vld1.64         {d16-d19}, [r1, :128]!
        vld1.64         {d20-d23}, [r1, :128]!
        vld1.64         {d24-d27}, [r1, :128]!

        IDCT8_1D        row
        // rest of the transpose (q8/q9 were handled inside the row pass,
        // the remaining vswps happen inside the col pass)
        vtrn.16         q10, q11
        vtrn.16         q12, q13
        vtrn.16         q14, q15
        vtrn.32         q8, q10
        vtrn.32         q9, q11
        vtrn.32         q12, q14
        vtrn.32         q13, q15
        vswp            d17, d24
        IDCT8_1D        col

        // pixel loads interleaved with the rounding shifts
        vld1.64         {d0}, [r0, :64], r2
        vrshr.s16       q8, q8, #6
        vld1.64         {d1}, [r0, :64], r2
        vrshr.s16       q9, q9, #6
        vld1.64         {d2}, [r0, :64], r2
        vrshr.s16       q10, q10, #6
        vld1.64         {d3}, [r0, :64], r2
        vrshr.s16       q11, q11, #6
        vld1.64         {d4}, [r0, :64], r2
        vrshr.s16       q12, q12, #6
        vld1.64         {d5}, [r0, :64], r2
        vrshr.s16       q13, q13, #6
        vld1.64         {d6}, [r0, :64], r2
        vrshr.s16       q14, q14, #6
        vld1.64         {d7}, [r0, :64], r2
        vrshr.s16       q15, q15, #6
        sub             r0, r0, r2, lsl #3      // rewind the eight rows just read

        // add, saturate to 8 bits, store
        vaddw.u8        q8, q8, d0
        vaddw.u8        q9, q9, d1
        vaddw.u8        q10, q10, d2
        vqmovun.s16     d0, q8
        vqmovun.s16     d1, q9
        vqmovun.s16     d2, q10
        vaddw.u8        q11, q11, d3
        vst1.64         {d0}, [r0, :64], r2
        vaddw.u8        q12, q12, d4
        vst1.64         {d1}, [r0, :64], r2
        vaddw.u8        q13, q13, d5
        vst1.64         {d2}, [r0, :64], r2
        vqmovun.s16     d3, q11
        vqmovun.s16     d4, q12
        vaddw.u8        q14, q14, d6
        vaddw.u8        q15, q15, d7
        vst1.64         {d3}, [r0, :64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d4}, [r0, :64], r2
        vqmovun.s16     d6, q14
        vqmovun.s16     d7, q15
        vst1.64         {d5}, [r0, :64], r2
        vst1.64         {d6}, [r0, :64], r2
        vst1.64         {d7}, [r0, :64], r2
        bx              lr
endfunc
|
||||
|
||||
// add16x16_idct8_neon: 16x16 pixel area processed as four 8x8 blocks in the
// order top-left, top-right, bottom-left, bottom-right. The coefficient
// pointer r1 simply keeps advancing.
// In:   r0 = pixels (FDEC_STRIDE rows), r1 = coefficients
// The return address is parked in ip, which the 8x8 routine leaves alone.
function add16x16_idct8_neon
        mov             ip, lr
        bl              X(add8x8_idct8_neon)
        sub             r0, r0, #8*FDEC_STRIDE-8    // up 8 rows, right 8 pixels
        bl              X(add8x8_idct8_neon)
        sub             r0, r0, #8                  // left 8 pixels, 8 rows down
        bl              X(add8x8_idct8_neon)
        sub             r0, r0, #8*FDEC_STRIDE-8    // up 8 rows, right 8 pixels
        mov             lr, ip
        b               X(add8x8_idct8_neon)        // tail call for the last block
endfunc
|
||||
|
||||
|
||||
// add8x8_idct_dc_neon: adds one DC value to each 4x4 quadrant of an 8x8
// pixel block, with saturation.
// In:   r0 = 8x8 pixel block, rows FDEC_STRIDE apart (read and written;
//            ends eight rows further down)
//       r1 = 4 x int16 DC values (64-bit alignment hint)
// Uses: r2, q0-q3, d16, q10-q13, q15
// Each DC is split into a non-negative part (added with vqadd) and the
// magnitude of a negative part (subtracted with vqsub), so the arithmetic
// stays in unsigned 8-bit lanes.
function add8x8_idct_dc_neon
        mov             r2, #FDEC_STRIDE
        vld1.64         {d16}, [r1, :64]
        vrshr.s16       d16, d16, #6            // rounding shift of the DCs
        vld1.64         {d0}, [r0, :64], r2
        vmov.i16        q15, #0
        vld1.64         {d1}, [r0, :64], r2
        vld1.64         {d2}, [r0, :64], r2
        vdup.16         d20, d16[0]
        vld1.64         {d3}, [r0, :64], r2
        vdup.16         d21, d16[1]
        vld1.64         {d4}, [r0, :64], r2
        vdup.16         d22, d16[2]
        vld1.64         {d5}, [r0, :64], r2
        vdup.16         d23, d16[3]
        vld1.64         {d6}, [r0, :64], r2
        vsub.s16        q12, q15, q10           // negated DCs for the top half
        vld1.64         {d7}, [r0, :64], r2
        vsub.s16        q13, q15, q11           // negated DCs for the bottom half

        sub             r0, r0, #8*FDEC_STRIDE  // rewind the eight rows just read

        // clamp to [0, 255]: keeps max(dc, 0) and max(-dc, 0)
        vqmovun.s16     d20, q10
        vqmovun.s16     d22, q11
        vqmovun.s16     d24, q12
        vqmovun.s16     d26, q13

        // duplicate each 8-byte row pattern so a q register covers two rows
        vmov            d21, d20
        vqadd.u8        q0, q0, q10
        vmov            d23, d22
        vqadd.u8        q1, q1, q10
        vmov            d25, d24
        vqadd.u8        q2, q2, q11
        vmov            d27, d26
        vqadd.u8        q3, q3, q11
        vqsub.u8        q0, q0, q12
        vqsub.u8        q1, q1, q12
        vqsub.u8        q2, q2, q13

        vst1.64         {d0}, [r0, :64], r2
        vqsub.u8        q3, q3, q13
        vst1.64         {d1}, [r0, :64], r2
        vst1.64         {d2}, [r0, :64], r2
        vst1.64         {d3}, [r0, :64], r2
        vst1.64         {d4}, [r0, :64], r2
        vst1.64         {d5}, [r0, :64], r2
        vst1.64         {d6}, [r0, :64], r2
        vst1.64         {d7}, [r0, :64], r2
        bx              lr
endfunc
|
||||
|
||||
// ADD16x4_IDCT_DC dc
// Adds four DC values (one per 4-pixel column group) to a 16-wide, 4-row
// pixel strip, with saturation.
//   dc  = d register holding 4 x int16, already scaled by the caller
//   r0  = load pointer, r2 = store pointer, r3 = row stride
//         (both pointers advance by four rows)
//   q15 = must be zero on entry
// Clobbers q2, q3, q8-q13.
.macro ADD16x4_IDCT_DC dc
        vld1.64         {d16-d17}, [r0, :128], r3
        vld1.64         {d18-d19}, [r0, :128], r3
        vdup.16         d4, \dc[0]
        vdup.16         d5, \dc[1]
        vld1.64         {d20-d21}, [r0, :128], r3
        vdup.16         d6, \dc[2]
        vdup.16         d7, \dc[3]
        vld1.64         {d22-d23}, [r0, :128], r3
        vsub.s16        q12, q15, q2            // negated DCs
        vsub.s16        q13, q15, q3

        // q2 = max(dc, 0) per pixel, q3 = max(-dc, 0) per pixel
        vqmovun.s16     d4, q2
        vqmovun.s16     d5, q3
        vqmovun.s16     d6, q12
        vqmovun.s16     d7, q13

        vqadd.u8        q8, q8, q2
        vqadd.u8        q9, q9, q2
        vqadd.u8        q10, q10, q2
        vqadd.u8        q11, q11, q2

        vqsub.u8        q8, q8, q3
        vqsub.u8        q9, q9, q3
        vqsub.u8        q10, q10, q3
        vst1.64         {d16-d17}, [r2, :128], r3
        vqsub.u8        q11, q11, q3
        vst1.64         {d18-d19}, [r2, :128], r3
        vst1.64         {d20-d21}, [r2, :128], r3
        vst1.64         {d22-d23}, [r2, :128], r3
.endm
|
||||
|
||||
@ add16x16_idct_dc_neon
@ Prototype in dct.h: void ( uint8_t *p_dst, int16_t dct[16] ), so r0 = p_dst
@ and r1 = dct under the ARM calling convention.
@ Loads the 16 coefficients, applies a rounding shift right by 6, then invokes
@ ADD16x4_IDCT_DC once per group of four coefficients (d0..d3) to update 16
@ rows of 16 bytes in place, FDEC_STRIDE bytes apart.
@ r2 is a second copy of the destination pointer used for the stores, and q15
@ is zeroed because ADD16x4_IDCT_DC reads it as the zero constant.
function add16x16_idct_dc_neon
|
||||
mov r2, r0
|
||||
mov r3, #FDEC_STRIDE
|
||||
vmov.i16 q15, #0
|
||||
|
||||
vld1.64 {d0-d3}, [r1,:64]
|
||||
vrshr.s16 q0, #6
|
||||
vrshr.s16 q1, #6
|
||||
|
||||
ADD16x4_IDCT_DC d0
|
||||
ADD16x4_IDCT_DC d1
|
||||
ADD16x4_IDCT_DC d2
|
||||
ADD16x4_IDCT_DC d3
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ sub8x8_dct_dc_neon
@ Prototype in dct.h: void ( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 ),
@ so r0 = dct, r1 = pix1 (rows FENC_STRIDE apart), r2 = pix2 (rows
@ FDEC_STRIDE apart).
@ Computes the 8x8 difference pix1 - pix2 widened to 16 bits, sums rows 0-3
@ into q0 and rows 4-7 into q1, then combines the four 4x4 quadrant sums.
@ With a, b, c, d the sums of the top-left, top-right, bottom-left and
@ bottom-right quadrants, the four values stored at r0 are
@   a+b+c+d, (a+b)-(c+d), (a-b)+(c-d), (a-b)-(c-d).
@ Loads are interleaved with the arithmetic; the instruction order is a
@ scheduling choice and the row sums do not depend on it.
function sub8x8_dct_dc_neon
|
||||
mov r3, #FENC_STRIDE
|
||||
mov ip, #FDEC_STRIDE
|
||||
vld1.64 {d16}, [r1,:64], r3
|
||||
vld1.64 {d17}, [r2,:64], ip
|
||||
vsubl.u8 q8, d16, d17
|
||||
vld1.64 {d18}, [r1,:64], r3
|
||||
vld1.64 {d19}, [r2,:64], ip
|
||||
vsubl.u8 q9, d18, d19
|
||||
vld1.64 {d20}, [r1,:64], r3
|
||||
vld1.64 {d21}, [r2,:64], ip
|
||||
vsubl.u8 q10, d20, d21
|
||||
vld1.64 {d22}, [r1,:64], r3
|
||||
vadd.s16 q0, q8, q9
|
||||
vld1.64 {d23}, [r2,:64], ip
|
||||
vsubl.u8 q11, d22, d23
|
||||
vld1.64 {d24}, [r1,:64], r3
|
||||
vadd.s16 q0, q0, q10
|
||||
vld1.64 {d25}, [r2,:64], ip
|
||||
vsubl.u8 q12, d24, d25
|
||||
vld1.64 {d26}, [r1,:64], r3
|
||||
vadd.s16 q0, q0, q11
|
||||
vld1.64 {d27}, [r2,:64], ip
|
||||
vsubl.u8 q13, d26, d27
|
||||
vld1.64 {d28}, [r1,:64], r3
|
||||
vld1.64 {d29}, [r2,:64], ip
|
||||
vsubl.u8 q14, d28, d29
|
||||
vld1.64 {d30}, [r1,:64], r3
|
||||
vadd.s16 q1, q12, q13
|
||||
vld1.64 {d31}, [r2,:64], ip
|
||||
vsubl.u8 q15, d30, d31
|
||||
vadd.s16 q1, q1, q14
|
||||
|
||||
@ d0/d1 = left/right column sums of rows 0-3, d2/d3 = same for rows 4-7
vadd.s16 d4, d0, d1
|
||||
vadd.s16 q1, q1, q15
|
||||
vsub.s16 d5, d0, d1
|
||||
vadd.s16 d6, d2, d3
|
||||
vsub.s16 d7, d2, d3
|
||||
vadd.s16 q0, q2, q3
|
||||
vsub.s16 q1, q2, q3
|
||||
|
||||
@ pairwise adds collapse each D register to a single 16-bit total
vpadd.s16 d0, d0, d2
|
||||
vpadd.s16 d1, d1, d3
|
||||
vpadd.s16 d0, d0, d1
|
||||
vst1.64 {d0}, [r0,:64]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ sub8x16_dct_dc_neon
@ Prototype in dct.h: void ( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 ),
@ so r0 = dct, r1 = pix1 (rows FENC_STRIDE apart), r2 = pix2 (rows
@ FDEC_STRIDE apart).
@ Computes the 8x16 difference pix1 - pix2 widened to 16 bits and accumulates
@ each group of four rows into one Q register: rows 0-3 -> q0, 4-7 -> q1,
@ 8-11 -> q2, 12-15 -> q3. The low/high D half of each holds the column sums
@ of the left/right 4x4 block. Three add/sub butterfly stages (labelled b and
@ a in the comments below) followed by pairwise adds reduce this to eight
@ 16-bit values, which are stored at r0.
@ q8-q15 are reused for the second half of the rows once the first eight rows
@ have been consumed.
function sub8x16_dct_dc_neon
|
||||
mov r3, #FENC_STRIDE
|
||||
mov ip, #FDEC_STRIDE
|
||||
vld1.64 {d16}, [r1,:64], r3
|
||||
vld1.64 {d17}, [r2,:64], ip
|
||||
vsubl.u8 q8, d16, d17
|
||||
vld1.64 {d18}, [r1,:64], r3
|
||||
vld1.64 {d19}, [r2,:64], ip
|
||||
vsubl.u8 q9, d18, d19
|
||||
vld1.64 {d20}, [r1,:64], r3
|
||||
vld1.64 {d21}, [r2,:64], ip
|
||||
vsubl.u8 q10, d20, d21
|
||||
vld1.64 {d22}, [r1,:64], r3
|
||||
vadd.s16 q0, q8, q9
|
||||
vld1.64 {d23}, [r2,:64], ip
|
||||
vsubl.u8 q11, d22, d23
|
||||
vld1.64 {d24}, [r1,:64], r3
|
||||
vadd.s16 q0, q0, q10
|
||||
vld1.64 {d25}, [r2,:64], ip
|
||||
vsubl.u8 q12, d24, d25
|
||||
vld1.64 {d26}, [r1,:64], r3
|
||||
vadd.s16 q0, q0, q11
|
||||
vld1.64 {d27}, [r2,:64], ip
|
||||
vsubl.u8 q13, d26, d27
|
||||
vld1.64 {d28}, [r1,:64], r3
|
||||
vld1.64 {d29}, [r2,:64], ip
|
||||
vsubl.u8 q14, d28, d29
|
||||
vld1.64 {d30}, [r1,:64], r3
|
||||
vadd.s16 q1, q12, q13
|
||||
vld1.64 {d31}, [r2,:64], ip
|
||||
vsubl.u8 q15, d30, d31
|
||||
|
||||
@ rows 8-15: start reloading q8.. while the sums for rows 4-7 complete
vld1.64 {d16}, [r1,:64], r3
|
||||
vadd.s16 q1, q1, q14
|
||||
vld1.64 {d17}, [r2,:64], ip
|
||||
vadd.s16 q1, q1, q15
|
||||
vld1.64 {d18}, [r1,:64], r3
|
||||
vsubl.u8 q8, d16, d17
|
||||
vld1.64 {d19}, [r2,:64], ip
|
||||
vsubl.u8 q9, d18, d19
|
||||
vld1.64 {d20}, [r1,:64], r3
|
||||
vld1.64 {d21}, [r2,:64], ip
|
||||
vsubl.u8 q10, d20, d21
|
||||
vld1.64 {d22}, [r1,:64], r3
|
||||
vadd.s16 q2, q8, q9
|
||||
vld1.64 {d23}, [r2,:64], ip
|
||||
vsubl.u8 q11, d22, d23
|
||||
vld1.64 {d24}, [r1,:64], r3
|
||||
vadd.s16 q2, q2, q10
|
||||
vld1.64 {d25}, [r2,:64], ip
|
||||
vsubl.u8 q12, d24, d25
|
||||
vld1.64 {d26}, [r1,:64], r3
|
||||
vadd.s16 q2, q2, q11
|
||||
vld1.64 {d27}, [r2,:64], ip
|
||||
vsubl.u8 q13, d26, d27
|
||||
vld1.64 {d28}, [r1,:64], r3
|
||||
vld1.64 {d29}, [r2,:64], ip
|
||||
vsubl.u8 q14, d28, d29
|
||||
vld1.64 {d30}, [r1,:64], r3
|
||||
vadd.s16 q3, q12, q13
|
||||
vld1.64 {d31}, [r2,:64], ip
|
||||
vsubl.u8 q15, d30, d31
|
||||
vadd.s16 q3, q3, q14
|
||||
|
||||
vadd.s16 d16, d0, d1 @ b0
|
||||
vadd.s16 q3, q3, q15
|
||||
vsub.s16 d17, d0, d1 @ b4
|
||||
vadd.s16 d18, d2, d3 @ b1
|
||||
vsub.s16 d19, d2, d3 @ b5
|
||||
vadd.s16 d20, d4, d5 @ b2
|
||||
vsub.s16 d21, d4, d5 @ b6
|
||||
vadd.s16 d22, d6, d7 @ b3
|
||||
vsub.s16 d23, d6, d7 @ b7
|
||||
vadd.s16 q0, q8, q9 @ b0 + b1, b4 + b5; a0, a2
|
||||
vsub.s16 q1, q8, q9 @ b0 - b1, b4 - b5; a4, a6
|
||||
vadd.s16 q2, q10, q11 @ b2 + b3, b6 + b7; a1, a3
|
||||
vsub.s16 q3, q10, q11 @ b2 - b3, b6 - b7; a5, a7
|
||||
|
||||
vadd.s16 q8, q0, q2 @ a0 + a1, a2 + a3
|
||||
vsub.s16 q9, q0, q2 @ a0 - a1, a2 - a3
|
||||
vsub.s16 q10, q1, q3 @ a4 - a5, a6 - a7
|
||||
vadd.s16 q11, q1, q3 @ a4 + a5, a6 + a7
|
||||
|
||||
@ pairwise adds collapse every D register into one 16-bit output value
vpadd.s16 d0, d16, d17
|
||||
vpadd.s16 d1, d18, d19
|
||||
vpadd.s16 d2, d20, d21
|
||||
vpadd.s16 d3, d22, d23
|
||||
vpadd.s16 d0, d0, d1
|
||||
vpadd.s16 d1, d2, d3
|
||||
vst1.64 {q0}, [r0,:64]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
|
||||
@ zigzag_scan_4x4_frame_neon
@ Prototype in dct.h: void ( int16_t level[16], int16_t dct[16] ), so
@ r0 = level (output) and r1 = dct (input).
@ Reorders the 32 input bytes with four byte-wise table lookups. The index
@ bytes come from the 32-byte scan4x4_frame table (defined outside this
@ view); each output D register indexes into the sub-range of d0-d3 named in
@ its vtbl, so the table entries are relative to the first register of that
@ sub-range. Both data pointers must be 16-byte aligned (:128).
function zigzag_scan_4x4_frame_neon
|
||||
movrel r2, scan4x4_frame
|
||||
vld1.64 {d0-d3}, [r1,:128]
|
||||
vld1.64 {d16-d19}, [r2,:128]
|
||||
vtbl.8 d4, {d0-d1}, d16
|
||||
vtbl.8 d5, {d1-d3}, d17
|
||||
vtbl.8 d6, {d0-d2}, d18
|
||||
vtbl.8 d7, {d2-d3}, d19
|
||||
vst1.64 {d4-d7}, [r0,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
70
common/arm/dct.h
Normal file
70
common/arm/dct.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/*****************************************************************************
|
||||
* dct.h: arm transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_DCT_H
|
||||
#define X264_ARM_DCT_H
|
||||
|
||||
#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
|
||||
void x264_dct4x4dc_neon( int16_t d[16] );
|
||||
#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
|
||||
void x264_idct4x4dc_neon( int16_t d[16] );
|
||||
|
||||
#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
|
||||
void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
|
||||
void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
|
||||
void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
|
||||
void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||
#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
|
||||
void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
|
||||
#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
|
||||
void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
|
||||
|
||||
#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
|
||||
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
|
||||
#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
|
||||
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
|
||||
#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
|
||||
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
|
||||
void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
|
||||
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
|
||||
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
|
||||
void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
|
||||
#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
|
||||
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
|
||||
|
||||
#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
|
||||
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
|
||||
|
||||
#endif
|
||||
795
common/arm/deblock-a.S
Normal file
795
common/arm/deblock-a.S
Normal file
@@ -0,0 +1,795 @@
|
||||
/*****************************************************************************
|
||||
* deblock-a.S: arm deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: Mans Rullgard <mans@mansr.com>
|
||||
* Martin Storsjo <martin@martin.st>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
@ h264_loop_filter_start
@ Common prologue of the tc0-based deblocking entry points.
@   In:   [sp] = pointer to four signed tc0 bytes (first stack argument)
@   Out:  d24  = the 32-bit tc0 word, duplicated into both 32-bit lanes
@ Clobbers ip and the condition flags.
@ Must be expanded before anything is pushed: it reads [sp] directly and may
@ return to the caller of the enclosing function.
@ The two ANDs fold the word onto itself so that bit 31 of ip becomes the AND
@ of the sign bits of all four bytes. When every tc0 byte is negative the
@ filter macros would mask off every lane, so the function returns at once.
@ The return is conditional on mi (N set) rather than lt (N != V): ands with a
@ shifted operand updates N, Z and C only, so lt would additionally depend on
@ whatever V value the caller happened to leave behind.
.macro h264_loop_filter_start
|
||||
ldr ip, [sp]
|
||||
ldr ip, [ip]
|
||||
vdup.32 d24, ip
|
||||
and ip, ip, ip, lsl #16
|
||||
ands ip, ip, ip, lsl #8
|
||||
bxmi lr
|
||||
.endm
|
||||
|
||||
@ align_push_regs
@ Saves d8-d15 (callee-saved under the ARM procedure call standard) on a
@ 16-byte aligned stack area.
@ ip = (sp & 15) + 32 is subtracted from sp, which aligns sp to 16 bytes and
@ reserves room for d12-d15; a further 32 bytes hold d8-d11.
@ ip keeps the first adjustment and must stay intact until align_pop_regs,
@ which uses it to restore the original sp.
.macro align_push_regs
|
||||
and ip, sp, #15
|
||||
add ip, ip, #32
|
||||
sub sp, sp, ip
|
||||
vst1.64 {d12-d15}, [sp,:128]
|
||||
sub sp, sp, #32
|
||||
vst1.64 {d8-d11}, [sp,:128]
|
||||
.endm
|
||||
|
||||
@ align_pop_regs
@ Counterpart of align_push_regs: reloads d8-d15 and restores sp.
@ The first load advances sp by 32 bytes (writeback); the second advances it
@ by ip, which must still hold the value computed by align_push_regs.
.macro align_pop_regs
|
||||
vld1.64 {d8-d11}, [sp,:128]!
|
||||
vld1.64 {d12-d15}, [sp,:128], ip
|
||||
.endm
|
||||
|
||||
@ h264_loop_filter_luma
@ tc0-based (non-intra) luma edge filter for 16 pixel positions at once.
@   In:   q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2
@         r2 = alpha, r3 = beta
@         d24 = tc0 word as left by h264_loop_filter_start
@   Out:  q4 = filtered p1, q8 = filtered p0, q0 = filtered q0,
@         q5 = filtered q1
@ Clobbers q2, q4-q7, q10-q12, q14, q15. q4-q7 alias d8-d15, which is why
@ every caller in this file brackets the macro with align_push_regs and
@ align_pop_regs.
@ The vmovl.u8 / vmovl.u16 / vsli.16 / vsli.32 sequence spreads each of the
@ four tc0 bytes over four consecutive byte lanes of q12.
@ q6 is the per-lane filter mask: abs(p0 - q0) < alpha, abs(p1 - p0) < beta,
@ abs(q1 - q0) < beta and tc0 >= 0. q4 / q5 are the additional masks that
@ enable the p1 / q1 updates; since a true mask is -1 per byte, subtracting
@ them from tc0 increases the clipping bound by one for each.
@ The instruction interleaving is deliberate scheduling; keep the order.
.macro h264_loop_filter_luma
|
||||
vdup.8 q11, r2 @ alpha
|
||||
vmovl.u8 q12, d24
|
||||
vabd.u8 q6, q8, q0 @ abs(p0 - q0)
|
||||
vmovl.u16 q12, d24
|
||||
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
|
||||
vsli.16 q12, q12, #8
|
||||
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
|
||||
vsli.32 q12, q12, #16
|
||||
vclt.u8 q6, q6, q11 @ < alpha
|
||||
vdup.8 q11, r3 @ beta
|
||||
vclt.s8 q7, q12, #0
|
||||
vclt.u8 q14, q14, q11 @ < beta
|
||||
vclt.u8 q15, q15, q11 @ < beta
|
||||
vbic q6, q6, q7
|
||||
vabd.u8 q4, q10, q8 @ abs(p2 - p0)
|
||||
vand q6, q6, q14
|
||||
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
|
||||
vclt.u8 q4, q4, q11 @ < beta
|
||||
vand q6, q6, q15
|
||||
vclt.u8 q5, q5, q11 @ < beta
|
||||
vand q4, q4, q6
|
||||
vand q5, q5, q6
|
||||
vand q12, q12, q6
|
||||
vrhadd.u8 q14, q8, q0
|
||||
vsub.i8 q6, q12, q4
|
||||
vqadd.u8 q7, q9, q12
|
||||
vhadd.u8 q10, q10, q14
|
||||
vsub.i8 q6, q6, q5
|
||||
vhadd.u8 q14, q2, q14
|
||||
vmin.u8 q7, q7, q10
|
||||
vqsub.u8 q11, q9, q12
|
||||
vqadd.u8 q2, q1, q12
|
||||
vmax.u8 q7, q7, q11
|
||||
vqsub.u8 q11, q1, q12
|
||||
vmin.u8 q14, q2, q14
|
||||
vmovl.u8 q2, d0
|
||||
vmax.u8 q14, q14, q11
|
||||
vmovl.u8 q10, d1
|
||||
vsubw.u8 q2, q2, d16
|
||||
vsubw.u8 q10, q10, d17
|
||||
vshl.i16 q2, q2, #2
|
||||
vshl.i16 q10, q10, #2
|
||||
vaddw.u8 q2, q2, d18
|
||||
vaddw.u8 q10, q10, d19
|
||||
vsubw.u8 q2, q2, d2
|
||||
vsubw.u8 q10, q10, d3
|
||||
vrshrn.i16 d4, q2, #3
|
||||
vrshrn.i16 d5, q10, #3
|
||||
vbsl q4, q7, q9
|
||||
vbsl q5, q14, q1
|
||||
vneg.s8 q7, q6
|
||||
vmovl.u8 q14, d16
|
||||
vmin.s8 q2, q2, q6
|
||||
vmovl.u8 q6, d17
|
||||
vmax.s8 q2, q2, q7
|
||||
vmovl.u8 q11, d0
|
||||
vmovl.u8 q12, d1
|
||||
vaddw.s8 q14, q14, d4
|
||||
vaddw.s8 q6, q6, d5
|
||||
vsubw.s8 q11, q11, d4
|
||||
vsubw.s8 q12, q12, d5
|
||||
vqmovun.s16 d16, q14
|
||||
vqmovun.s16 d17, q6
|
||||
vqmovun.s16 d0, q11
|
||||
vqmovun.s16 d1, q12
|
||||
.endm
|
||||
|
||||
@ deblock_v_luma_neon
@ Prototype in deblock.h:
@   void ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ so r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0.
@ Filters the horizontal edge between row pix - stride and row pix over 16
@ pixels. Rows pix .. pix + 2*stride are loaded as q0, q1, q2 and rows
@ pix - 3*stride .. pix - stride as p2, p1, p0; every row must be 16-byte
@ aligned (:128). The four rows from pix - 2*stride onwards are written back.
@ Returns without touching memory when h264_loop_filter_start finds that all
@ four tc0 values are negative.
function deblock_v_luma_neon
|
||||
h264_loop_filter_start
|
||||
|
||||
vld1.64 {d0, d1}, [r0,:128], r1
|
||||
vld1.64 {d2, d3}, [r0,:128], r1
|
||||
vld1.64 {d4, d5}, [r0,:128], r1
|
||||
sub r0, r0, r1, lsl #2
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld1.64 {d20,d21}, [r0,:128], r1
|
||||
vld1.64 {d18,d19}, [r0,:128], r1
|
||||
vld1.64 {d16,d17}, [r0,:128], r1
|
||||
|
||||
align_push_regs
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
@ r0 is back at pix here; step up two rows and store p1, p0, q0, q1
sub r0, r0, r1, lsl #1
|
||||
vst1.64 {d8, d9}, [r0,:128], r1
|
||||
vst1.64 {d16,d17}, [r0,:128], r1
|
||||
vst1.64 {d0, d1}, [r0,:128], r1
|
||||
vst1.64 {d10,d11}, [r0,:128]
|
||||
|
||||
align_pop_regs
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ deblock_h_luma_neon
@ Prototype in deblock.h:
@   void ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ so r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0.
@ Filters the vertical edge to the left of column pix over 16 rows.
@ Eight bytes starting at pix - 4 are loaded from each of the 16 rows and
@ passed through TRANSPOSE8x8 (defined outside this view; presumably it
@ turns the rows into per-column vectors) so that the registers match the
@ layout expected by h264_loop_filter_luma. The four modified columns are
@ run through TRANSPOSE4x4 and written as 4 bytes per row at pix - 2.
@ Returns without touching memory when all four tc0 values are negative.
function deblock_h_luma_neon
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, #4
|
||||
vld1.64 {d6}, [r0], r1
|
||||
vld1.64 {d20}, [r0], r1
|
||||
vld1.64 {d18}, [r0], r1
|
||||
vld1.64 {d16}, [r0], r1
|
||||
vld1.64 {d0}, [r0], r1
|
||||
vld1.64 {d2}, [r0], r1
|
||||
vld1.64 {d4}, [r0], r1
|
||||
vld1.64 {d26}, [r0], r1
|
||||
vld1.64 {d7}, [r0], r1
|
||||
vld1.64 {d21}, [r0], r1
|
||||
vld1.64 {d19}, [r0], r1
|
||||
vld1.64 {d17}, [r0], r1
|
||||
vld1.64 {d1}, [r0], r1
|
||||
vld1.64 {d3}, [r0], r1
|
||||
vld1.64 {d5}, [r0], r1
|
||||
vld1.64 {d27}, [r0], r1
|
||||
|
||||
TRANSPOSE8x8 q3, q10, q9, q8, q0, q1, q2, q13
|
||||
|
||||
align_push_regs
|
||||
|
||||
h264_loop_filter_luma
|
||||
|
||||
TRANSPOSE4x4 q4, q8, q0, q5
|
||||
|
||||
@ rewind the 16 rows consumed by the loads; pix - 4 + 2 = pix - 2
sub r0, r0, r1, lsl #4
|
||||
add r0, r0, #2
|
||||
vst1.32 {d8[0]}, [r0], r1
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d10[0]}, [r0], r1
|
||||
vst1.32 {d8[1]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0], r1
|
||||
vst1.32 {d10[1]}, [r0], r1
|
||||
vst1.32 {d9[0]}, [r0], r1
|
||||
vst1.32 {d17[0]}, [r0], r1
|
||||
vst1.32 {d1[0]}, [r0], r1
|
||||
vst1.32 {d11[0]}, [r0], r1
|
||||
vst1.32 {d9[1]}, [r0], r1
|
||||
vst1.32 {d17[1]}, [r0], r1
|
||||
vst1.32 {d1[1]}, [r0], r1
|
||||
vst1.32 {d11[1]}, [r0], r1
|
||||
|
||||
align_pop_regs
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ h264_loop_filter_luma_intra
@ Intra luma edge filter for 16 pixel positions at once.
@   In:   q11 = p3, q10 = p2, q9 = p1, q8 = p0,
@         q0 = q0, q1 = q1, q2 = q2, q3 = q3
@         r2 = alpha, r3 = beta
@   Out:  q10, q9, q8 (p2, p1, p0) and q0, q1, q2 updated in place
@ Requirements on the expanding function:
@   - it must define a local label 9 that the macro can branch forward to;
@     the branch is taken, before any pixel register is modified, when no
@     lane satisfies if_1
@   - sp must be 16-byte aligned (the callers in this file run
@     align_push_regs first); 32 bytes of stack are used temporarily to park
@     the if_1 / if_2 masks and are released by the final writeback load
@   - lr is used as a scratch register, so it has to be saved beforehand
@ Clobbers r2, lr, the condition flags, q4-q7 and q12-q15.
@ Masks: if_1 = abs(p0 - q0) < alpha and abs(p1 - p0) < beta and
@ abs(q1 - q0) < beta; if_2 = abs(p0 - q0) < (alpha >> 2) + 2;
@ if_3 / if_4 = abs(p2 - p0) < beta / abs(q2 - q0) < beta.
@ Lanes with if_1 and if_2 and if_3 (if_4 on the q side) take the values
@ tagged _2 in the comments; lanes with only if_1 take the 3-tap value
@ (2*p1 + p0 + q1 + 2) >> 2 and its q-side mirror.
.macro h264_loop_filter_luma_intra
|
||||
vdup.8 q14, r2 @ alpha
|
||||
vabd.u8 q4, q8, q0 @ abs(p0 - q0)
|
||||
vabd.u8 q5, q9, q8 @ abs(p1 - p0)
|
||||
vabd.u8 q6, q1, q0 @ abs(q1 - q0)
|
||||
vdup.8 q15, r3 @ beta
|
||||
vmov.u8 q13, #2
|
||||
vclt.u8 q7, q4, q14 @ < alpha
|
||||
vshr.u8 q14, q14, #2 @ alpha >> 2
|
||||
vclt.u8 q5, q5, q15 @ < beta
|
||||
vadd.u8 q14, q14, q13 @ (alpha >> 2) + 2
|
||||
vand q7, q7, q5
|
||||
vclt.u8 q6, q6, q15 @ < beta
|
||||
vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2
|
||||
vand q12, q7, q6 @ if_1
|
||||
@ narrow the 128-bit if_1 mask to 64 bits so two core registers can test it
vshrn.u16 d28, q12, #4
|
||||
vmov r2, lr, d28
|
||||
orrs r2, r2, lr
|
||||
beq 9f
|
||||
|
||||
sub sp, sp, #32
|
||||
vst1.8 {q12-q13}, [sp,:128]
|
||||
|
||||
vshll.u8 q4, d18, #1 @ 2*p1
|
||||
vshll.u8 q5, d19, #1
|
||||
vaddw.u8 q4, q4, d16 @ 2*p1 + p0
|
||||
vaddw.u8 q5, q5, d17
|
||||
vaddw.u8 q4, q4, d2 @ 2*p1 + p0 + q1
|
||||
vaddw.u8 q5, q5, d3
|
||||
vrshrn.u16 d24, q4, #2
|
||||
vrshrn.u16 d25, q5, #2
|
||||
|
||||
vaddl.u8 q6, d20, d16 @ p2 + p0
|
||||
vaddl.u8 q7, d21, d17
|
||||
vaddw.u8 q6, q6, d0 @ p2 + p0 + q0
|
||||
vaddw.u8 q7, q7, d1
|
||||
vadd.u16 q4, q4, q6 @ p2 + 2*p1 + 2*p0 + q0 + q1
|
||||
vadd.u16 q5, q5, q7
|
||||
vaddw.u8 q4, q4, d0 @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
|
||||
vaddw.u8 q5, q5, d1
|
||||
vrshrn.u16 d26, q4, #3 @ p0'_2
|
||||
vrshrn.u16 d27, q5, #3
|
||||
vaddw.u8 q6, q6, d18 @ p2 + p1 + p0 + q0
|
||||
vaddw.u8 q7, q7, d19
|
||||
vrshrn.u16 d28, q6, #2 @ p1'_2
|
||||
vrshrn.u16 d29, q7, #2
|
||||
vaddl.u8 q4, d22, d20 @ p3 + p2
|
||||
vaddl.u8 q5, d23, d21
|
||||
vshl.u16 q4, q4, #1 @ 2*p3 + 2*p2
|
||||
vshl.u16 q5, q5, #1
|
||||
vadd.u16 q4, q4, q6 @ 2*p3 + 3*p2 + p1 + p0 + q0
|
||||
vadd.u16 q5, q5, q7
|
||||
vrshrn.u16 d30, q4, #3 @ p2'_2
|
||||
vrshrn.u16 d31, q5, #3
|
||||
|
||||
vdup.8 q4, r3 @ beta
|
||||
vabd.u8 q5, q10, q8 @ abs(p2 - p0)
|
||||
vld1.8 {q6-q7}, [sp,:128] @ if_1, if_2
|
||||
vclt.u8 q5, q5, q4 @ < beta if_3
|
||||
|
||||
vand q7, q7, q5 @ if_2 && if_3
|
||||
vmvn q4, q7
|
||||
vand q7, q7, q6 @ if_1 && if_2 && if_3
|
||||
vand q6, q4, q6 @ if_1 && !(if_2 && if_3)
|
||||
|
||||
@ copy p0 to q15 so it can be clobbered
|
||||
vbit q10, q15, q7
|
||||
vmov q15, q8
|
||||
vbit q8, q12, q6
|
||||
|
||||
@ wait for q9 to clobber
|
||||
vshll.u8 q4, d2, #1 @ 2*q1
|
||||
vshll.u8 q5, d3, #1
|
||||
|
||||
@ NOTE(review): same operands as the vbit three instructions earlier and
@ none of q8, q12, q6 changed in between, so this looks redundant - confirm
vbit q8, q12, q6
|
||||
|
||||
vaddw.u8 q4, q4, d0 @ 2*q1 + q0
|
||||
vaddw.u8 q5, q5, d1
|
||||
|
||||
vbit q8, q13, q7
|
||||
|
||||
vaddw.u8 q4, q4, d18 @ 2*q1 + q0 + p1
|
||||
vaddw.u8 q5, q5, d19
|
||||
|
||||
vbit q9, q14, q7
|
||||
|
||||
vrshrn.u16 d24, q4, #2
|
||||
vrshrn.u16 d25, q5, #2
|
||||
|
||||
@ d30/d31 (q15) hold the unfiltered p0 saved above
vaddl.u8 q6, d4, d0 @ q2 + q0
|
||||
vaddl.u8 q7, d5, d1
|
||||
vaddw.u8 q6, q6, d30 @ q2 + q0 + p0
|
||||
vaddw.u8 q7, q7, d31
|
||||
vadd.u16 q4, q4, q6 @ q2 + 2*q1 + 2*q0 + p0 + p1
|
||||
vadd.u16 q5, q5, q7
|
||||
vaddw.u8 q4, q4, d30 @ q2 + 2*q1 + 2*q0 + 2*p0 + p1
|
||||
vaddw.u8 q5, q5, d31
|
||||
vrshrn.u16 d26, q4, #3 @ q0'_2
|
||||
vrshrn.u16 d27, q5, #3
|
||||
vaddw.u8 q6, q6, d2 @ q2 + q1 + q0 + p0
|
||||
vaddw.u8 q7, q7, d3
|
||||
vrshrn.u16 d28, q6, #2 @ q1'_2
|
||||
vrshrn.u16 d29, q7, #2
|
||||
vaddl.u8 q4, d6, d4 @ q3 + q2
|
||||
vaddl.u8 q5, d7, d5
|
||||
vshl.u16 q4, q4, #1 @ 2*q3 + 2*q2
|
||||
vshl.u16 q5, q5, #1
|
||||
vadd.u16 q4, q4, q6 @ 2*q3 + 3*q2 + q1 + q0 + p0
|
||||
vadd.u16 q5, q5, q7
|
||||
vrshrn.u16 d30, q4, #3 @ q2'_2
|
||||
vrshrn.u16 d31, q5, #3
|
||||
|
||||
vdup.8 q4, r3 @ beta
|
||||
vabd.u8 q5, q2, q0 @ abs(q2 - q0)
|
||||
vld1.8 {q6-q7}, [sp,:128]! @ if_1, if_2
|
||||
vclt.u8 q5, q5, q4 @ < beta if_4
|
||||
|
||||
vand q7, q7, q5 @ if_2 && if_4
|
||||
vmvn q4, q7
|
||||
vand q7, q6, q7 @ if_1 && if_2 && if_4
|
||||
vand q6, q6, q4 @ if_1 && !(if_2 && if_4)
|
||||
|
||||
vbit q0, q12, q6
|
||||
vbit q1, q14, q7
|
||||
vbit q0, q13, q7
|
||||
vbit q2, q15, q7
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
@ deblock_v_luma_intra_neon
@ Intra filter for the horizontal luma edge between row pix - stride and row
@ pix, 16 pixels wide. The prototype is outside this view; from the code,
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
@ Rows pix .. pix + 3*stride are loaded as q0..q3 and rows
@ pix - 4*stride .. pix - stride as p3..p0 (all 16-byte aligned). The six
@ rows from pix - 3*stride onwards (p2..q2) are written back.
@ lr is pushed because h264_loop_filter_luma_intra uses it as scratch.
@ Label 9 is the target of the early-out branch inside the macro; it skips
@ the stores but still restores d8-d15.
function deblock_v_luma_intra_neon
|
||||
push {lr}
|
||||
vld1.64 {d0, d1}, [r0,:128], r1
|
||||
vld1.64 {d2, d3}, [r0,:128], r1
|
||||
vld1.64 {d4, d5}, [r0,:128], r1
|
||||
vld1.64 {d6, d7}, [r0,:128], r1
|
||||
sub r0, r0, r1, lsl #3
|
||||
vld1.64 {d22,d23}, [r0,:128], r1
|
||||
vld1.64 {d20,d21}, [r0,:128], r1
|
||||
vld1.64 {d18,d19}, [r0,:128], r1
|
||||
vld1.64 {d16,d17}, [r0,:128]
|
||||
|
||||
align_push_regs
|
||||
|
||||
h264_loop_filter_luma_intra
|
||||
|
||||
@ r0 is at pix - stride here; step up two more rows to reach the p2 row
sub r0, r0, r1, lsl #1
|
||||
vst1.64 {d20,d21}, [r0,:128], r1
|
||||
vst1.64 {d18,d19}, [r0,:128], r1
|
||||
vst1.64 {d16,d17}, [r0,:128], r1
|
||||
vst1.64 {d0, d1}, [r0,:128], r1
|
||||
vst1.64 {d2, d3}, [r0,:128], r1
|
||||
vst1.64 {d4, d5}, [r0,:128]
|
||||
9:
|
||||
align_pop_regs
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
@ deblock_h_luma_intra_neon
@ Intra filter for the vertical luma edge to the left of column pix over 16
@ rows. The prototype is outside this view; from the code, r0 = pix,
@ r1 = stride, r2 = alpha, r3 = beta.
@ Eight bytes starting at pix - 4 are loaded from each of the 16 rows and
@ passed through TRANSPOSE8x8 (defined outside this view; presumably a
@ transpose into per-column vectors p3..q3). After filtering, the same macro
@ is applied again and all 16 rows are written back to where they came from.
@ lr is pushed because h264_loop_filter_luma_intra uses it as scratch.
@ Label 9 is the target of the early-out branch inside the macro; it skips
@ the second transpose and the stores.
function deblock_h_luma_intra_neon
|
||||
push {lr}
|
||||
sub r0, r0, #4
|
||||
vld1.64 {d22}, [r0], r1
|
||||
vld1.64 {d20}, [r0], r1
|
||||
vld1.64 {d18}, [r0], r1
|
||||
vld1.64 {d16}, [r0], r1
|
||||
vld1.64 {d0}, [r0], r1
|
||||
vld1.64 {d2}, [r0], r1
|
||||
vld1.64 {d4}, [r0], r1
|
||||
vld1.64 {d6}, [r0], r1
|
||||
vld1.64 {d23}, [r0], r1
|
||||
vld1.64 {d21}, [r0], r1
|
||||
vld1.64 {d19}, [r0], r1
|
||||
vld1.64 {d17}, [r0], r1
|
||||
vld1.64 {d1}, [r0], r1
|
||||
vld1.64 {d3}, [r0], r1
|
||||
vld1.64 {d5}, [r0], r1
|
||||
vld1.64 {d7}, [r0], r1
|
||||
|
||||
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
|
||||
|
||||
align_push_regs
|
||||
|
||||
h264_loop_filter_luma_intra
|
||||
|
||||
TRANSPOSE8x8 q11, q10, q9, q8, q0, q1, q2, q3
|
||||
|
||||
@ rewind the 16 rows consumed by the loads; r0 is back at pix - 4
sub r0, r0, r1, lsl #4
|
||||
vst1.64 {d22}, [r0], r1
|
||||
vst1.64 {d20}, [r0], r1
|
||||
vst1.64 {d18}, [r0], r1
|
||||
vst1.64 {d16}, [r0], r1
|
||||
vst1.64 {d0}, [r0], r1
|
||||
vst1.64 {d2}, [r0], r1
|
||||
vst1.64 {d4}, [r0], r1
|
||||
vst1.64 {d6}, [r0], r1
|
||||
vst1.64 {d23}, [r0], r1
|
||||
vst1.64 {d21}, [r0], r1
|
||||
vst1.64 {d19}, [r0], r1
|
||||
vst1.64 {d17}, [r0], r1
|
||||
vst1.64 {d1}, [r0], r1
|
||||
vst1.64 {d3}, [r0], r1
|
||||
vst1.64 {d5}, [r0], r1
|
||||
vst1.64 {d7}, [r0], r1
|
||||
9:
|
||||
align_pop_regs
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
// h264_loop_filter_chroma
// tc0-based chroma edge filter operating on 16 bytes at once.
//   In:   q9 = p1, q8 = p0, q0 = q0, q1 = q1
//         r2 = alpha, r3 = beta
//         d24 = tc0 word as left by h264_loop_filter_start
//   Out:  q8 (p0) and q0 (q0) updated in place
// Clobbers q2, q3, q10-q15. No callee-saved NEON register is touched, so the
// callers need no align_push_regs.
// The two vmovl.u8 / vsli.16 pairs spread each of the four tc0 bytes over
// four consecutive byte lanes of q12.
// delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3 is limited to +tc0, zeroed in
// lanes that fail the test abs(p0 - q0) < alpha, abs(p1 - p0) < beta,
// abs(q1 - q0) < beta, tc0 >= 0, limited to -tc0, and then added to p0 and
// subtracted from q0 with saturation to 0..255.
.macro h264_loop_filter_chroma
|
||||
vdup.8 q11, r2 // alpha
|
||||
vmovl.u8 q12, d24
|
||||
vabd.u8 q13, q8, q0 // abs(p0 - q0)
|
||||
vabd.u8 q14, q9, q8 // abs(p1 - p0)
|
||||
vsubl.u8 q2, d0, d16
|
||||
vsubl.u8 q3, d1, d17
|
||||
vsli.16 q12, q12, #8
|
||||
vshl.i16 q2, q2, #2
|
||||
vshl.i16 q3, q3, #2
|
||||
vabd.u8 q15, q1, q0 // abs(q1 - q0)
|
||||
vmovl.u8 q12, d24
|
||||
vaddw.u8 q2, q2, d18
|
||||
vaddw.u8 q3, q3, d19
|
||||
vclt.u8 q13, q13, q11 // < alpha
|
||||
vsubw.u8 q2, q2, d2
|
||||
vsubw.u8 q3, q3, d3
|
||||
vsli.16 q12, q12, #8
|
||||
vdup.8 q11, r3 // beta
|
||||
vclt.s8 q10, q12, #0
|
||||
vrshrn.i16 d4, q2, #3
|
||||
vrshrn.i16 d5, q3, #3
|
||||
vclt.u8 q14, q14, q11 // < beta
|
||||
vbic q13, q13, q10
|
||||
vclt.u8 q15, q15, q11 // < beta
|
||||
vand q13, q13, q14
|
||||
vneg.s8 q10, q12
|
||||
vand q13, q13, q15
|
||||
vmin.s8 q2, q2, q12
|
||||
vmovl.u8 q14, d16
|
||||
vand q2, q2, q13
|
||||
vmovl.u8 q15, d17
|
||||
vmax.s8 q2, q2, q10
|
||||
vmovl.u8 q11, d0
|
||||
vmovl.u8 q12, d1
|
||||
vaddw.s8 q14, q14, d4
|
||||
vaddw.s8 q15, q15, d5
|
||||
vsubw.s8 q11, q11, d4
|
||||
vsubw.s8 q12, q12, d5
|
||||
vqmovun.s16 d16, q14
|
||||
vqmovun.s16 d17, q15
|
||||
vqmovun.s16 d0, q11
|
||||
vqmovun.s16 d1, q12
|
||||
.endm
|
||||
|
||||
@ deblock_v_chroma_neon
@ Prototype in deblock.h:
@   void ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
@ so r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0.
@ Filters the horizontal chroma edge between row pix - stride and row pix
@ over 16 bytes. Rows pix - 2*stride .. pix + stride are loaded as p1, p0,
@ q0, q1 (16-byte aligned); only the p0 and q0 rows are written back.
@ Returns without touching memory when all four tc0 values are negative.
function deblock_v_chroma_neon
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld1.8 {d18,d19}, [r0,:128], r1
|
||||
vld1.8 {d16,d17}, [r0,:128], r1
|
||||
vld1.8 {d0, d1}, [r0,:128], r1
|
||||
vld1.8 {d2, d3}, [r0,:128]
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
@ r0 is at pix + stride here; go back to the p0 row
sub r0, r0, r1, lsl #1
|
||||
vst1.8 {d16,d17}, [r0,:128], r1
|
||||
vst1.8 {d0, d1}, [r0,:128], r1
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ deblock_h_chroma_neon
@ Filters a vertical chroma edge over 8 rows. The prototype is outside this
@ view; presumably the same argument layout as deblock_v_chroma_neon
@ (r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0) - TODO confirm.
@ Eight bytes starting at pix - 4 are loaded per row, i.e. four 16-bit units
@ (presumably interleaved chroma sample pairs) for p1, p0, q0, q1.
@ TRANSPOSE4x4_16 (defined outside this view) rearranges them for
@ h264_loop_filter_chroma; vtrn.16 then pairs p0 with q0 again so each row
@ gets 4 bytes written at pix - 2.
@
@ deblock_h_chroma is a secondary entry point used by
@ deblock_h_chroma_422_neon. It expects r0 = pix - 4 and d24 = tc0 word
@ already set up, and it leaves r0 at its entry value + 8 * r1 + 2.
function deblock_h_chroma_neon
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, #4
|
||||
deblock_h_chroma:
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d19}, [r0], r1
|
||||
vld1.8 {d17}, [r0], r1
|
||||
vld1.8 {d1}, [r0], r1
|
||||
vld1.8 {d3}, [r0], r1
|
||||
|
||||
TRANSPOSE4x4_16 q9, q8, q0, q1
|
||||
|
||||
h264_loop_filter_chroma
|
||||
|
||||
vtrn.16 q8, q0
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
add r0, r0, #2
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0], r1
|
||||
vst1.32 {d17[0]}, [r0], r1
|
||||
vst1.32 {d1[0]}, [r0], r1
|
||||
vst1.32 {d17[1]}, [r0], r1
|
||||
vst1.32 {d1[1]}, [r0], r1
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ deblock_h_chroma_422_neon
@ Vertical chroma edge filter over 16 rows, built from two passes through the
@ deblock_h_chroma entry point of deblock_h_chroma_neon.
@ The stride in r1 is doubled, so the first pass covers rows 0, 2, .., 14.
@ r0 is then rewound by the eight doubled rows, moved down by one original
@ row (r1 lsr 1) and back by the 2 bytes the first pass added, and the second
@ pass covers rows 1, 3, .., 15.
@ d24 is rebuilt between the passes because the filter macro overwrites q12;
@ tc0 is found at [sp, #4] at that point since lr is still on the stack.
@ The second pass is a tail call: its bx lr returns to our caller.
function deblock_h_chroma_422_neon
|
||||
h264_loop_filter_start
|
||||
push {lr}
|
||||
sub r0, r0, #4
|
||||
add r1, r1, r1
|
||||
bl deblock_h_chroma
|
||||
ldr ip, [sp, #4]
|
||||
ldr ip, [ip]
|
||||
vdup.32 d24, ip
|
||||
sub r0, r0, r1, lsl #3
|
||||
add r0, r0, r1, lsr #1
|
||||
sub r0, r0, #2
|
||||
pop {lr}
|
||||
b deblock_h_chroma
|
||||
endfunc
|
||||
|
||||
@ h264_loop_filter_chroma8
@ 8-byte (D register) variant of h264_loop_filter_chroma.
@   In:   d18 = p1, d16 = p0, d0 = q0, d2 = q1
@         r2 = alpha, r3 = beta
@         d24 = tc0 word as left by h264_loop_filter_start
@   Out:  d16 (p0) and d0 (q0) updated in place
@ Clobbers q2, d20, q11, q12, d26, q14, d30.
@ The single vmovl.u8 / vsli.16 pair spreads each of the four tc0 bytes over
@ two consecutive byte lanes of d24.
@ The arithmetic is the same as in the 16-byte macro:
@ delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3, limited to +/- tc0 and zeroed
@ in lanes that fail the alpha / beta / tc0 >= 0 tests.
.macro h264_loop_filter_chroma8
|
||||
vdup.8 d22, r2 @ alpha
|
||||
vmovl.u8 q12, d24
|
||||
vabd.u8 d26, d16, d0 @ abs(p0 - q0)
|
||||
vabd.u8 d28, d18, d16 @ abs(p1 - p0)
|
||||
vsubl.u8 q2, d0, d16
|
||||
vsli.16 d24, d24, #8
|
||||
vshl.i16 q2, q2, #2
|
||||
vabd.u8 d30, d2, d0 @ abs(q1 - q0)
|
||||
vaddw.u8 q2, q2, d18
|
||||
vclt.u8 d26, d26, d22 @ < alpha
|
||||
vsubw.u8 q2, q2, d2
|
||||
vdup.8 d22, r3 @ beta
|
||||
vclt.s8 d20, d24, #0
|
||||
vrshrn.i16 d4, q2, #3
|
||||
vclt.u8 d28, d28, d22 @ < beta
|
||||
vbic d26, d26, d20
|
||||
vclt.u8 d30, d30, d22 @ < beta
|
||||
vand d26, d26, d28
|
||||
vneg.s8 d20, d24
|
||||
vand d26, d26, d30
|
||||
vmin.s8 d4, d4, d24
|
||||
vmovl.u8 q14, d16
|
||||
vand d4, d4, d26
|
||||
vmax.s8 d4, d4, d20
|
||||
vmovl.u8 q11, d0
|
||||
vaddw.s8 q14, q14, d4
|
||||
vsubw.s8 q11, q11, d4
|
||||
vqmovun.s16 d16, q14
|
||||
vqmovun.s16 d0, q11
|
||||
.endm
|
||||
|
||||
@ deblock_h_chroma_mbaff_neon
@ Four-row variant of deblock_h_chroma_neon, using the 8-byte filter macro.
@ The prototype is outside this view; presumably r0 = pix, r1 = stride,
@ r2 = alpha, r3 = beta, [sp] = tc0 - TODO confirm.
@ Eight bytes starting at pix - 4 are loaded from each of four rows; after
@ filtering, 4 bytes per row are written at pix - 2.
@ Returns without touching memory when all four tc0 values are negative.
function deblock_h_chroma_mbaff_neon
|
||||
h264_loop_filter_start
|
||||
|
||||
sub r0, r0, #4
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
|
||||
TRANSPOSE4x4_16 d18, d16, d0, d2
|
||||
|
||||
h264_loop_filter_chroma8
|
||||
|
||||
vtrn.16 d16, d0
|
||||
|
||||
sub r0, r0, r1, lsl #2
|
||||
add r0, r0, #2
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ h264_loop_filter_chroma_intra width
@ Intra chroma edge filter.
@   In:   q9 = p1, q8 = p0, q0 = q0, q1 = q1; r2 = alpha, r3 = beta
@   Out:  q8 (p0) and q0 (q0) updated in place; q1 is overwritten
@   width = 16 (default) processes all 16 bytes; any other value skips the
@           arithmetic for the high D halves, so only d16 and d0 are
@           meaningful afterwards (the 8-byte caller stores just those)
@ Clobbers q1, q2, q11, q13-q15, plus q3, q10, q12 when width is 16.
@ In lanes where abs(p0 - q0) < alpha, abs(p1 - p0) < beta and
@ abs(q1 - q0) < beta:
@   p0 = (2*p1 + p0 + q1 + 2) >> 2
@   q0 = (2*q1 + q0 + p1 + 2) >> 2
@ Other lanes keep their original value.
.macro h264_loop_filter_chroma_intra, width=16
|
||||
vdup.8 q11, r2 @ alpha
|
||||
vabd.u8 q13, q8, q0 @ abs(p0 - q0)
|
||||
vabd.u8 q14, q9, q8 @ abs(p1 - p0)
|
||||
vabd.u8 q15, q1, q0 @ abs(q1 - q0)
|
||||
vclt.u8 q13, q13, q11 @ < alpha
|
||||
vdup.8 q11, r3 @ beta
|
||||
vclt.u8 q14, q14, q11 @ < beta
|
||||
vclt.u8 q15, q15, q11 @ < beta
|
||||
vand q13, q13, q14
|
||||
vand q13, q13, q15
|
||||
|
||||
vshll.u8 q14, d18, #1
|
||||
vshll.u8 q2, d2, #1
|
||||
.ifc \width, 16
|
||||
vshll.u8 q15, d19, #1
|
||||
vshll.u8 q3, d3, #1
|
||||
vaddl.u8 q12, d17, d3
|
||||
vaddl.u8 q10, d1, d19
|
||||
.endif
|
||||
vaddl.u8 q11, d16, d2
|
||||
vaddl.u8 q1, d18, d0 @ or vaddw q2, to not clobber q1
|
||||
vadd.u16 q14, q14, q11
|
||||
vadd.u16 q2, q2, q1
|
||||
.ifc \width, 16
|
||||
vadd.u16 q15, q15, q12
|
||||
vadd.u16 q3, q3, q10
|
||||
.endif
|
||||
vqrshrn.u16 d28, q14, #2
|
||||
vqrshrn.u16 d4, q2, #2
|
||||
.ifc \width, 16
|
||||
vqrshrn.u16 d29, q15, #2
|
||||
vqrshrn.u16 d5, q3, #2
|
||||
.endif
|
||||
vbit q8, q14, q13
|
||||
vbit q0, q2, q13
|
||||
.endm
|
||||
|
||||
@ deblock_v_chroma_intra_neon
@ Intra filter for the horizontal chroma edge between row pix - stride and
@ row pix over 16 bytes. The prototype is outside this view; from the code,
@ r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
@ Rows pix - 2*stride .. pix + stride are loaded as p1, p0, q0, q1 with
@ vld2.8, which puts even bytes in the first and odd bytes in the second D
@ register (presumably separating the two interleaved chroma planes); vst2.8
@ re-interleaves on store. Only the p0 and q0 rows are written back.
function deblock_v_chroma_intra_neon
|
||||
sub r0, r0, r1, lsl #1
|
||||
vld2.8 {d18,d19}, [r0,:128], r1
|
||||
vld2.8 {d16,d17}, [r0,:128], r1
|
||||
vld2.8 {d0, d1}, [r0,:128], r1
|
||||
vld2.8 {d2, d3}, [r0,:128]
|
||||
|
||||
h264_loop_filter_chroma_intra
|
||||
|
||||
@ r0 is at pix + stride here; go back to the p0 row
sub r0, r0, r1, lsl #1
|
||||
vst2.8 {d16,d17}, [r0,:128], r1
|
||||
vst2.8 {d0, d1}, [r0,:128], r1
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ deblock_h_chroma_intra_neon
@ Intra filter for a vertical chroma edge over 8 rows. The prototype is
@ outside this view; from the code, r0 = pix, r1 = stride, r2 = alpha,
@ r3 = beta.
@ Eight bytes starting at pix - 4 are loaded per row; after TRANSPOSE4x4_16
@ (defined outside this view) and filtering, vtrn.16 pairs p0 with q0 again
@ and 4 bytes per row are written at pix - 2.
@ On return r0 = pix + 8 * stride - 2 and r1-r3 are unchanged;
@ deblock_h_chroma_422_intra_neon relies on both.
function deblock_h_chroma_intra_neon
|
||||
sub r0, r0, #4
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
vld1.8 {d19}, [r0], r1
|
||||
vld1.8 {d17}, [r0], r1
|
||||
vld1.8 {d1}, [r0], r1
|
||||
vld1.8 {d3}, [r0], r1
|
||||
|
||||
TRANSPOSE4x4_16 q9, q8, q0, q1
|
||||
|
||||
h264_loop_filter_chroma_intra
|
||||
|
||||
vtrn.16 q8, q0
|
||||
|
||||
sub r0, r0, r1, lsl #3
|
||||
add r0, r0, #2
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0], r1
|
||||
vst1.32 {d17[0]}, [r0], r1
|
||||
vst1.32 {d1[0]}, [r0], r1
|
||||
vst1.32 {d17[1]}, [r0], r1
|
||||
vst1.32 {d1[1]}, [r0], r1
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ deblock_h_chroma_422_intra_neon
@ Intra filter for a vertical chroma edge over 16 rows, implemented as two
@ consecutive calls to deblock_h_chroma_intra_neon.
@ The first call returns with r0 = pix + 8 * stride - 2; adding 2 yields the
@ pix argument for the lower eight rows. r1-r3 are not modified by the
@ callee, so they are reused as they are.
@ The second call is a tail call: its bx lr returns to our caller.
function deblock_h_chroma_422_intra_neon
|
||||
push {lr}
|
||||
bl X(deblock_h_chroma_intra_neon)
|
||||
add r0, r0, #2
|
||||
pop {lr}
|
||||
b X(deblock_h_chroma_intra_neon)
|
||||
endfunc
|
||||
|
||||
@ deblock_h_chroma_intra_mbaff_neon
@ Four-row variant of deblock_h_chroma_intra_neon. The prototype is outside
@ this view; from the code, r0 = pix, r1 = stride, r2 = alpha, r3 = beta.
@ Eight bytes starting at pix - 4 are loaded from each of four rows; the
@ filter macro is expanded with width=8, so only the low D halves (d16, d0)
@ carry results, and 4 bytes per row are written at pix - 2.
function deblock_h_chroma_intra_mbaff_neon
|
||||
sub r0, r0, #4
|
||||
vld1.8 {d18}, [r0], r1
|
||||
vld1.8 {d16}, [r0], r1
|
||||
vld1.8 {d0}, [r0], r1
|
||||
vld1.8 {d2}, [r0], r1
|
||||
|
||||
TRANSPOSE4x4_16 d18, d16, d0, d2
|
||||
|
||||
h264_loop_filter_chroma_intra width=8
|
||||
|
||||
vtrn.16 d16, d0
|
||||
|
||||
sub r0, r0, r1, lsl #2
|
||||
add r0, r0, #2
|
||||
vst1.32 {d16[0]}, [r0], r1
|
||||
vst1.32 {d0[0]}, [r0], r1
|
||||
vst1.32 {d16[1]}, [r0], r1
|
||||
vst1.32 {d0[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
@ deblock_strength_neon
@ Computes two 16-byte boundary-strength vectors and stores them at r3
@ (labelled bs[0] below) and r3 + 32 (labelled bs[1]).
@ The C prototype is outside this view; the roles below are taken from the
@ code and its comments:
@   r0       = nnz bytes
@   r1       = ref bytes (40 bytes are consumed per pass)
@   r2       = mv data, 16-bit components (176 bytes are stepped per pass)
@   r3       = output base
@   [sp]     = limit for the odd-indexed mv component (presumably the
@              vertical one - TODO confirm)
@   [sp, #4] = pass control: the "lists" body runs a second time only when
@              this value is exactly 1
@ q10 is built as 16-bit lanes of (limit << 8) - 256 + 3, i.e. byte
@ thresholds 3 and limit - 1 alternating. After an unsigned saturating
@ subtract, a byte is non-zero when the corresponding absolute mv difference
@ is at least 4, or at least limit, respectively. vqmovn.u16 then folds each
@ component pair into a single byte.
@ q8 / q9 accumulate (OR) the ref and mv differences against the two
@ neighbours; the vext / vuzp shuffles line up each entry with its
@ neighbours (presumably left for q8 and top for q9 - confirm against the C
@ reference). Final value per entry: 2 if either nnz byte is set, else 1 if
@ any ref or mv difference was flagged, else 0.
@ Only caller-saved NEON registers (q0-q3, q8-q15) are used.
function deblock_strength_neon
|
||||
ldr ip, [sp]
|
||||
vmov.i8 q8, #0
|
||||
lsl ip, ip, #8
|
||||
add r3, r3, #32
|
||||
sub ip, ip, #(1<<8)-3
|
||||
vmov.i8 q9, #0
|
||||
vdup.16 q10, ip
|
||||
ldr ip, [sp, #4]
|
||||
|
||||
lists:
|
||||
@ load bytes ref
|
||||
vld1.8 {d31}, [r1]!
|
||||
add r2, r2, #16
|
||||
vld1.8 {q1}, [r1]!
|
||||
vmov.i8 q0, #0
|
||||
vld1.8 {q2}, [r1]!
|
||||
vext.8 q3, q0, q1, #15
|
||||
vext.8 q0, q0, q2, #15
|
||||
vuzp.32 q1, q2
|
||||
vuzp.32 q3, q0
|
||||
vext.8 q1, q15, q2, #12
|
||||
|
||||
veor q0, q0, q2
|
||||
veor q1, q1, q2
|
||||
vorr q8, q8, q0
|
||||
vorr q9, q9, q1
|
||||
|
||||
vld1.16 {q11}, [r2,:128]! @ mv + 0x10
|
||||
vld1.16 {q3}, [r2,:128]! @ mv + 0x20
|
||||
vld1.16 {q12}, [r2,:128]! @ mv + 0x30
|
||||
vld1.16 {q2}, [r2,:128]! @ mv + 0x40
|
||||
vld1.16 {q13}, [r2,:128]! @ mv + 0x50
|
||||
vext.8 q3, q3, q12, #12
|
||||
vext.8 q2, q2, q13, #12
|
||||
vabd.s16 q0, q12, q3
|
||||
vld1.16 {q3}, [r2,:128]! @ mv + 0x60
|
||||
vabd.s16 q1, q13, q2
|
||||
vld1.16 {q14}, [r2,:128]! @ mv + 0x70
|
||||
vqmovn.u16 d0, q0
|
||||
vld1.16 {q2}, [r2,:128]! @ mv + 0x80
|
||||
vld1.16 {q15}, [r2,:128]! @ mv + 0x90
|
||||
vqmovn.u16 d1, q1
|
||||
vext.8 q3, q3, q14, #12
|
||||
vext.8 q2, q2, q15, #12
|
||||
vabd.s16 q3, q14, q3
|
||||
vabd.s16 q2, q15, q2
|
||||
vqmovn.u16 d2, q3
|
||||
vqmovn.u16 d3, q2
|
||||
|
||||
vqsub.u8 q0, q0, q10
|
||||
vqsub.u8 q1, q1, q10
|
||||
vqmovn.u16 d0, q0
|
||||
vqmovn.u16 d1, q1
|
||||
|
||||
vabd.s16 q1, q12, q13
|
||||
vorr q8, q8, q0
|
||||
|
||||
vabd.s16 q0, q11, q12
|
||||
vabd.s16 q2, q13, q14
|
||||
vabd.s16 q3, q14, q15
|
||||
vqmovn.u16 d0, q0
|
||||
vqmovn.u16 d1, q1
|
||||
vqmovn.u16 d2, q2
|
||||
vqmovn.u16 d3, q3
|
||||
|
||||
vqsub.u8 q0, q0, q10
|
||||
vqsub.u8 q1, q1, q10
|
||||
vqmovn.u16 d0, q0
|
||||
vqmovn.u16 d1, q1
|
||||
@ ip: 1 -> 0 repeats the body once; 0 -> -1 (or a second pass) falls through
subs ip, ip, #1
|
||||
vorr q9, q9, q0
|
||||
beq lists
|
||||
|
||||
@ ip becomes the post-index step from the bs[1] slot back to the bs[0] slot
mov ip, #-32
|
||||
@ load bytes nnz
|
||||
vld1.8 {d31}, [r0]!
|
||||
vld1.8 {q1}, [r0]!
|
||||
vmov.i8 q0, #0
|
||||
vld1.8 {q2}, [r0]
|
||||
vext.8 q3, q0, q1, #15
|
||||
vext.8 q0, q0, q2, #15
|
||||
vuzp.32 q1, q2
|
||||
vuzp.32 q3, q0
|
||||
vext.8 q1, q15, q2, #12
|
||||
|
||||
vorr q0, q0, q2
|
||||
vorr q1, q1, q2
|
||||
vmov.u8 q10, #1
|
||||
vmin.u8 q0, q0, q10
|
||||
vmin.u8 q1, q1, q10
|
||||
vmin.u8 q8, q8, q10 @ mv ? 1 : 0
|
||||
vmin.u8 q9, q9, q10
|
||||
vadd.u8 q0, q0, q0 @ nnz ? 2 : 0
|
||||
vadd.u8 q1, q1, q1
|
||||
vmax.u8 q8, q8, q0
|
||||
vmax.u8 q9, q9, q1
|
||||
vzip.16 d16, d17
|
||||
vst1.8 {q9}, [r3,:128], ip @ bs[1]
|
||||
vtrn.8 d16, d17
|
||||
vtrn.32 d16, d17
|
||||
|
||||
vst1.8 {q8}, [r3,:128] @ bs[0]
|
||||
bx lr
|
||||
endfunc
|
||||
58
common/arm/deblock.h
Normal file
58
common/arm/deblock.h
Normal file
@@ -0,0 +1,58 @@
|
||||
/*****************************************************************************
|
||||
* deblock.h: arm deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_DEBLOCK_H
|
||||
#define X264_ARM_DEBLOCK_H
|
||||
|
||||
#define x264_deblock_v_luma_neon x264_template(deblock_v_luma_neon)
|
||||
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_luma_neon x264_template(deblock_h_luma_neon)
|
||||
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_chroma_neon x264_template(deblock_v_chroma_neon)
|
||||
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_neon x264_template(deblock_h_chroma_neon)
|
||||
void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_strength_neon x264_template(deblock_strength_neon)
|
||||
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
#define x264_deblock_h_chroma_422_neon x264_template(deblock_h_chroma_422_neon)
|
||||
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_mbaff_neon x264_template(deblock_h_chroma_mbaff_neon)
|
||||
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_intra_mbaff_neon x264_template(deblock_h_chroma_intra_mbaff_neon)
|
||||
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_neon x264_template(deblock_h_chroma_intra_neon)
|
||||
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_422_intra_neon x264_template(deblock_h_chroma_422_intra_neon)
|
||||
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_neon x264_template(deblock_v_chroma_intra_neon)
|
||||
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_luma_intra_neon x264_template(deblock_h_luma_intra_neon)
|
||||
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
|
||||
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
|
||||
#endif
|
||||
1938
common/arm/mc-a.S
Normal file
1938
common/arm/mc-a.S
Normal file
File diff suppressed because it is too large
Load Diff
366
common/arm/mc-c.c
Normal file
366
common/arm/mc-c.c
Normal file
@@ -0,0 +1,366 @@
|
||||
/*****************************************************************************
|
||||
* mc-c.c: arm motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "mc.h"
|
||||
|
||||
#define x264_prefetch_ref_arm x264_template(prefetch_ref_arm)
|
||||
void x264_prefetch_ref_arm( uint8_t *, intptr_t, int );
|
||||
#define x264_prefetch_fenc_arm x264_template(prefetch_fenc_arm)
|
||||
void x264_prefetch_fenc_arm( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
|
||||
#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
|
||||
void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
|
||||
#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
|
||||
void x264_memzero_aligned_neon( void *dst, size_t n );
|
||||
|
||||
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
|
||||
void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
|
||||
void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
|
||||
void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
|
||||
void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
|
||||
void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
|
||||
void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
|
||||
void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
|
||||
void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
|
||||
void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
|
||||
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
|
||||
void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
|
||||
void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
|
||||
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
|
||||
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
|
||||
|
||||
#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
|
||||
void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
|
||||
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
|
||||
pixel *dstv, intptr_t i_dstv,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
|
||||
void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
|
||||
pixel *dstb, intptr_t i_dstb,
|
||||
pixel *dstc, intptr_t i_dstc,
|
||||
pixel *src, intptr_t i_src, int pw, int w, int h );
|
||||
#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
|
||||
void x264_plane_copy_interleave_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *srcu, intptr_t i_srcu,
|
||||
pixel *srcv, intptr_t i_srcv, int w, int h );
|
||||
#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
|
||||
void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
|
||||
pixel *src, intptr_t i_src, int w, int h );
|
||||
|
||||
#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
|
||||
void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
|
||||
#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
|
||||
void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||
#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
|
||||
void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
|
||||
|
||||
#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
|
||||
#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
|
||||
#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
|
||||
#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
|
||||
#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
|
||||
#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
|
||||
#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
|
||||
#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
|
||||
#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
|
||||
#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
|
||||
#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
|
||||
#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
|
||||
#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
|
||||
#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
|
||||
#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
|
||||
#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
|
||||
#if !HIGH_BIT_DEPTH
|
||||
#define MC_WEIGHT(func)\
|
||||
void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||
void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||
void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||
void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
|
||||
\
|
||||
static weight_fn_t mc##func##_wtab_neon[6] =\
|
||||
{\
|
||||
x264_mc_weight_w4##func##_neon,\
|
||||
x264_mc_weight_w4##func##_neon,\
|
||||
x264_mc_weight_w8##func##_neon,\
|
||||
x264_mc_weight_w16##func##_neon,\
|
||||
x264_mc_weight_w16##func##_neon,\
|
||||
x264_mc_weight_w20##func##_neon,\
|
||||
};
|
||||
|
||||
MC_WEIGHT()
|
||||
MC_WEIGHT(_nodenom)
|
||||
MC_WEIGHT(_offsetadd)
|
||||
MC_WEIGHT(_offsetsub)
|
||||
#endif
|
||||
|
||||
#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
|
||||
void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
|
||||
void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
|
||||
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
#define x264_mc_copy_w16_aligned_neon x264_template(mc_copy_w16_aligned_neon)
|
||||
void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
|
||||
#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
|
||||
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
|
||||
#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
|
||||
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
|
||||
|
||||
#define x264_hpel_filter_v_neon x264_template(hpel_filter_v_neon)
|
||||
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
|
||||
#define x264_hpel_filter_c_neon x264_template(hpel_filter_c_neon)
|
||||
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
|
||||
#define x264_hpel_filter_h_neon x264_template(hpel_filter_h_neon)
|
||||
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
|
||||
|
||||
#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
|
||||
void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
|
||||
#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
|
||||
void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
|
||||
#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
|
||||
void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
|
||||
#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
|
||||
void x264_integral_init8v_neon( uint16_t *, intptr_t );
|
||||
|
||||
#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
|
||||
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
|
||||
|
||||
#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
|
||||
void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
|
||||
#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
|
||||
void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
|
||||
|
||||
#if !HIGH_BIT_DEPTH
|
||||
/* Select the weighted-prediction kernel table for *w and cache its operand.
 *
 *  - scale == 1<<denom : the weight is a pure offset; pick the offsetadd or
 *    offsetsub table by the sign of i_offset and store the offset's magnitude
 *    in cachea[0].
 *  - denom == 0        : pick the _nodenom table.
 *  - otherwise         : pick the general table.
 *
 * h is not used. */
static void weight_cache_neon( x264_t *h, x264_weight_t *w )
{
    if( w->i_scale != 1<<w->i_denom )
    {
        /* Real scaling is needed; only the denominator decides the variant. */
        w->weightfn = w->i_denom ? mc_wtab_neon : mc_nodenom_wtab_neon;
        return;
    }

    /* Offset-only weight. */
    if( w->i_offset < 0 )
    {
        w->weightfn = mc_offsetsub_wtab_neon;
        w->cachea[0] = -w->i_offset;
    }
    else
    {
        w->weightfn = mc_offsetadd_wtab_neon;
        w->cachea[0] = w->i_offset;
    }
}
|
||||
|
||||
static void (* const pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
|
||||
{
|
||||
NULL,
|
||||
x264_pixel_avg2_w4_neon,
|
||||
x264_pixel_avg2_w8_neon,
|
||||
x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
|
||||
x264_pixel_avg2_w16_neon,
|
||||
x264_pixel_avg2_w20_neon,
|
||||
};
|
||||
|
||||
static void (* const mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
|
||||
{
|
||||
NULL,
|
||||
x264_mc_copy_w4_neon,
|
||||
x264_mc_copy_w8_neon,
|
||||
NULL,
|
||||
x264_mc_copy_w16_neon,
|
||||
};
|
||||
|
||||
static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
|
||||
uint8_t *src[4], intptr_t i_src_stride,
|
||||
int mvx, int mvy,
|
||||
int i_width, int i_height, const x264_weight_t *weight )
|
||||
{
|
||||
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
|
||||
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
|
||||
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
|
||||
if( (mvy&3) == 3 ) // explicit if() to force conditional add
|
||||
src1 += i_src_stride;
|
||||
|
||||
if( qpel_idx & 5 ) /* qpel interpolation needed */
|
||||
{
|
||||
uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
|
||||
pixel_avg_wtab_neon[i_width>>2](
|
||||
dst, i_dst_stride, src1, i_src_stride,
|
||||
src2, i_height );
|
||||
if( weight->weightfn )
|
||||
weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
|
||||
}
|
||||
else if( weight->weightfn )
|
||||
weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
|
||||
else
|
||||
mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
|
||||
}
|
||||
|
||||
static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
|
||||
uint8_t *src[4], intptr_t i_src_stride,
|
||||
int mvx, int mvy,
|
||||
int i_width, int i_height, const x264_weight_t *weight )
|
||||
{
|
||||
int qpel_idx = ((mvy&3)<<2) + (mvx&3);
|
||||
intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
|
||||
uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset;
|
||||
if( (mvy&3) == 3 ) // explicit if() to force conditional add
|
||||
src1 += i_src_stride;
|
||||
|
||||
if( qpel_idx & 5 ) /* qpel interpolation needed */
|
||||
{
|
||||
uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
|
||||
pixel_avg_wtab_neon[i_width>>2](
|
||||
dst, *i_dst_stride, src1, i_src_stride,
|
||||
src2, i_height );
|
||||
if( weight->weightfn )
|
||||
weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
|
||||
return dst;
|
||||
}
|
||||
else if( weight->weightfn )
|
||||
{
|
||||
weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
|
||||
return dst;
|
||||
}
|
||||
else
|
||||
{
|
||||
*i_dst_stride = i_src_stride;
|
||||
return src1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Run the three per-row half-pel filter kernels over `height` rows.
 *
 * src is first moved back to the 16-byte boundary at or below it; the three
 * destination pointers are moved back by the same amount and width is
 * enlarged to match. Each row then calls the v, c and h kernels in that
 * order; v and c both receive buf+8 (presumably v leaves its intermediate
 * row there for c to consume — the kernels are defined in assembly). */
static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                              intptr_t stride, int width, int height, int16_t *buf )
{
    const intptr_t realign = (intptr_t)src & 15;

    src   -= realign;
    dstv  -= realign;
    dstc  -= realign;
    dsth  -= realign;
    width += realign;

    for( ; height--; dsth += stride, dstv += stride, dstc += stride, src += stride )
    {
        x264_hpel_filter_v_neon( dstv, src, buf+8, stride, width );
        x264_hpel_filter_c_neon( dstc, buf+8, width );
        x264_hpel_filter_h_neon( dsth, src, width );
    }
}
|
||||
|
||||
/* Instantiate helper functions through macros defined outside this file.
 * NOTE(review): presumably these generate plane_copy_neon,
 * plane_copy_swap_neon, plane_copy_interleave_neon and
 * mbtree_propagate_list_neon, which x264_mc_init_arm() installs below —
 * confirm against the macro definitions, including the meaning of the 16. */
PLANE_COPY(16, neon)
|
||||
PLANE_COPY_SWAP(16, neon)
|
||||
PLANE_INTERLEAVE(neon)
|
||||
PROPAGATE_LIST(neon)
|
||||
#endif // !HIGH_BIT_DEPTH
|
||||
|
||||
/* Install the ARM implementations into *pf according to the cpu flag mask.
 *
 * Nothing is changed unless X264_CPU_ARMV6 is set. With ARMv6 the prefetch
 * helpers are installed (8-bit builds only); the remaining entries require
 * X264_CPU_NEON. Apart from the aligned memcpy/memzero helpers, every
 * assignment is compiled only when HIGH_BIT_DEPTH is off. */
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf )
{
    if( (cpu & X264_CPU_ARMV6) == 0 )
        return;

#if !HIGH_BIT_DEPTH
    pf->prefetch_ref      = x264_prefetch_ref_arm;
    pf->prefetch_fenc_420 = x264_prefetch_fenc_arm;
    pf->prefetch_fenc_422 = x264_prefetch_fenc_arm; /* FIXME */
#endif // !HIGH_BIT_DEPTH

    /* Everything from here on is NEON code. */
    if( (cpu & X264_CPU_NEON) == 0 )
        return;

#if !HIGH_BIT_DEPTH
    /* block copies */
    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_aligned_neon;
    pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
    pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;

    /* whole-plane copies */
    pf->plane_copy                  = plane_copy_neon;
    pf->plane_copy_swap             = plane_copy_swap_neon;
    pf->plane_copy_interleave       = plane_copy_interleave_neon;
    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;

    /* chroma load/store */
    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;

    /* averaging, by block size */
    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;

    /* weighted prediction */
    pf->weight       = mc_wtab_neon;
    pf->offsetadd    = mc_offsetadd_wtab_neon;
    pf->offsetsub    = mc_offsetsub_wtab_neon;
    pf->weight_cache = weight_cache_neon;

    /* motion compensation and interpolation */
    pf->mc_luma                = mc_luma_neon;
    pf->mc_chroma              = x264_mc_chroma_neon;
    pf->get_ref                = get_ref_neon;
    pf->hpel_filter            = hpel_filter_neon;
    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;

    /* integral images */
    pf->integral_init4h = x264_integral_init4h_neon;
    pf->integral_init4v = x264_integral_init4v_neon;
    pf->integral_init8h = x264_integral_init8h_neon;
    pf->integral_init8v = x264_integral_init8v_neon;

    /* MB-tree */
    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH

    pf->memzero_aligned = x264_memzero_aligned_neon;
    /* Apple's gcc cannot align stack variables and ALIGNED_ARRAY does not
     * work on structs, so the aligned memcpy is not installed on macOS. */
#ifndef SYS_MACOSX
    pf->memcpy_aligned = x264_memcpy_aligned_neon;
#endif
}
|
||||
32
common/arm/mc.h
Normal file
32
common/arm/mc.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*****************************************************************************
|
||||
* mc.h: arm motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_MC_H
|
||||
#define X264_ARM_MC_H
|
||||
|
||||
/* Fills *pf with the ARM motion-compensation implementations that the
 * `cpu` flag mask allows. The name is mapped through x264_template(),
 * which is defined elsewhere. */
#define x264_mc_init_arm x264_template(mc_init_arm)
|
||||
void x264_mc_init_arm( uint32_t cpu, x264_mc_functions_t *pf );
|
||||
|
||||
#endif
|
||||
1535
common/arm/pixel-a.S
Normal file
1535
common/arm/pixel-a.S
Normal file
File diff suppressed because it is too large
Load Diff
160
common/arm/pixel.h
Normal file
160
common/arm/pixel.h
Normal file
@@ -0,0 +1,160 @@
|
||||
/*****************************************************************************
|
||||
* pixel.h: arm pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_PIXEL_H
|
||||
#define X264_ARM_PIXEL_H
|
||||
|
||||
/* Template-mapped names of the ARM pixel kernels, grouped by metric and
 * listed from the largest block size to the smallest. */

/* two-source average, by width */
#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)

/* average */
#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)

/* SAD (the 4-wide sizes also have ARMv6 versions) */
#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
#define x264_pixel_sad_4x8_armv6 x264_template(pixel_sad_4x8_armv6)
#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
#define x264_pixel_sad_4x4_armv6 x264_template(pixel_sad_4x4_armv6)

/* SAD, aligned variants */
#define x264_pixel_sad_aligned_16x16_neon x264_template(pixel_sad_aligned_16x16_neon)
#define x264_pixel_sad_aligned_16x8_neon x264_template(pixel_sad_aligned_16x8_neon)
#define x264_pixel_sad_aligned_8x16_neon x264_template(pixel_sad_aligned_8x16_neon)
#define x264_pixel_sad_aligned_8x8_neon x264_template(pixel_sad_aligned_8x8_neon)
#define x264_pixel_sad_aligned_8x4_neon x264_template(pixel_sad_aligned_8x4_neon)
#define x264_pixel_sad_aligned_4x8_neon x264_template(pixel_sad_aligned_4x8_neon)
#define x264_pixel_sad_aligned_4x4_neon x264_template(pixel_sad_aligned_4x4_neon)

/* SAD, aligned "dual" variants (no 4-wide sizes are mapped) */
#define x264_pixel_sad_aligned_16x16_neon_dual x264_template(pixel_sad_aligned_16x16_neon_dual)
#define x264_pixel_sad_aligned_16x8_neon_dual x264_template(pixel_sad_aligned_16x8_neon_dual)
#define x264_pixel_sad_aligned_8x16_neon_dual x264_template(pixel_sad_aligned_8x16_neon_dual)
#define x264_pixel_sad_aligned_8x8_neon_dual x264_template(pixel_sad_aligned_8x8_neon_dual)
#define x264_pixel_sad_aligned_8x4_neon_dual x264_template(pixel_sad_aligned_8x4_neon_dual)

/* SAD against three candidates */
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)

/* SAD against four candidates */
#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)

/* SATD */
#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)

/* SSD */
#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
|
||||
/* DECL_PIXELS( ret, name, suffix, args ) declares
 * x264_pixel_<name>_<W>x<H>_<suffix> for the seven block sizes 16x16, 16x8,
 * 8x16, 8x8, 8x4, 4x8 and 4x4, each returning `ret` and taking the
 * parenthesised parameter list `args`. */
#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;

/* DECL_X1 declares the two-pointer metrics that return int.
 * The second and fourth parameters are intptr_t, matching the explicit
 * declarations in this header with the same shape (x264_pixel_sad_4x4_armv6,
 * x264_pixel_sa8d_8x8_neon) and the intptr_t parameter used by DECL_X4.
 * They were previously spelled `int`, which is only the same type on targets
 * where intptr_t is int. */
#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )

/* DECL_X4 declares both the _x3 and the _x4 families for `name`: four
 * (respectively five) pixel pointers, one intptr_t and an int * result
 * pointer; the functions return void. */
#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
|
||||
|
||||
int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||
int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||
|
||||
DECL_X1( sad, neon )
|
||||
DECL_X1( sad_aligned, neon )
|
||||
DECL_X1( sad_aligned, neon_dual )
|
||||
DECL_X4( sad, neon )
|
||||
DECL_X1( satd, neon )
|
||||
DECL_X1( ssd, neon )
|
||||
|
||||
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
|
||||
void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
|
||||
|
||||
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
|
||||
int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
|
||||
|
||||
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
|
||||
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
|
||||
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
|
||||
|
||||
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
|
||||
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
|
||||
#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
|
||||
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
|
||||
#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
|
||||
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
|
||||
#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
|
||||
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
|
||||
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
|
||||
|
||||
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
|
||||
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
|
||||
uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
|
||||
|
||||
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
|
||||
void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
|
||||
const uint8_t *, intptr_t,
|
||||
int sums[2][4] );
|
||||
#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
|
||||
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
|
||||
|
||||
#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
|
||||
int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
|
||||
|
||||
#endif
|
||||
808
common/arm/predict-a.S
Normal file
808
common/arm/predict-a.S
Normal file
@@ -0,0 +1,808 @@
|
||||
/*****************************************************************************
|
||||
* predict.S: arm intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Mans Rullgard <mans@mansr.com>
|
||||
* Martin Storsjo <martin@martin.st>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
// Table of the 16-bit weights 1..8. The planar (_p) predictors in this file
// load it with a :128 alignment hint and multiply edge differences by it.
const p16weight, align=4
.short 1, 2, 3, 4, 5, 6, 7, 8
endconst
|
||||
|
||||
.text
|
||||
|
||||
// ldcol.8 rd, rs, rt, n=8, hi=0
// Gather bytes that lie \rt apart in memory into the lanes of D register
// \rd. Every load reads one byte from [\rs] into a single lane and then
// post-increments \rs by \rt.
//   n == 8              lanes 0-7 are filled (eight loads)
//   n != 8 and hi == 0  only lanes 0-3 are filled (four loads)
//   n != 8 and hi == 1  only lanes 4-7 are filled (four loads)
// Lanes that are not loaded keep their previous contents. On exit \rs has
// advanced by \rt once per load that was performed.
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
    vld1.8          {\rd[0]}, [\rs], \rt
    vld1.8          {\rd[1]}, [\rs], \rt
    vld1.8          {\rd[2]}, [\rs], \rt
    vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
    vld1.8          {\rd[4]}, [\rs], \rt
    vld1.8          {\rd[5]}, [\rs], \rt
    vld1.8          {\rd[6]}, [\rs], \rt
    vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm
|
||||
|
||||
// ldcol.16 rd1, rd2, rs, rt, ru
// Gather sixteen bytes that lie \rt apart into two D registers, using two
// pointers so the loads can be interleaved:
//   \rd1 lanes 0-7 come from [\rs], [\rs + \rt], ... (first eight bytes)
//   \rd2 lanes 0-7 come from \ru = \rs + 8 * \rt onwards (next eight bytes)
// \ru is overwritten as scratch. Both \rs and \ru end up advanced by
// 8 * \rt from their starting values.
.macro ldcol.16  rd1,  rd2,  rs,  rt,  ru
    add             \ru, \rs, \rt, lsl #3
    vld1.8          {\rd1[0]}, [\rs], \rt
    vld1.8          {\rd2[0]}, [\ru], \rt
    vld1.8          {\rd1[1]}, [\rs], \rt
    vld1.8          {\rd2[1]}, [\ru], \rt
    vld1.8          {\rd1[2]}, [\rs], \rt
    vld1.8          {\rd2[2]}, [\ru], \rt
    vld1.8          {\rd1[3]}, [\rs], \rt
    vld1.8          {\rd2[3]}, [\ru], \rt
    vld1.8          {\rd1[4]}, [\rs], \rt
    vld1.8          {\rd2[4]}, [\ru], \rt
    vld1.8          {\rd1[5]}, [\rs], \rt
    vld1.8          {\rd2[5]}, [\ru], \rt
    vld1.8          {\rd1[6]}, [\rs], \rt
    vld1.8          {\rd2[6]}, [\ru], \rt
    vld1.8          {\rd1[7]}, [\rs], \rt
    vld1.8          {\rd2[7]}, [\ru], \rt
.endm
|
||||
|
||||
// add16x8 dq, dl, dh, rl, rh
// Horizontal sum of the sixteen unsigned bytes held in \rl and \rh.
//   \dq = \rl + \rh, widened to eight u16 lanes
//   \dl = \dl + \dh, then two pairwise adds of \dl with itself
// The callers in this file pass \dl / \dh as the low / high halves of \dq;
// in that case every u16 lane of \dl ends up holding the total of all
// sixteen bytes.
.macro add16x8  dq,  dl,  dh,  rl,  rh
    vaddl.u8        \dq, \rl, \rh
    vadd.u16        \dl, \dl, \dh
    vpadd.u16       \dl, \dl, \dl
    vpadd.u16       \dl, \dl, \dl
.endm
|
||||
|
||||
|
||||
// C prototype (predict.h): void x264_predict_4x4_h_armv6( uint8_t *src )
// In: r0 = src. Scratch: r1-r3, ip.
// For each of the four rows, the byte immediately left of the row
// (src[row*FDEC_STRIDE - 1]) is replicated into a 32-bit word and stored
// over the four bytes of that row.
// Hand written because gcc does not believe in using the free shift in add:
// x + (x << 8) followed by + (x << 16) copies one byte into all four.
function predict_4x4_h_armv6
    @ fetch every left neighbour before any row is written
    ldrb            r1,  [r0, #0*FDEC_STRIDE-1]
    ldrb            r2,  [r0, #1*FDEC_STRIDE-1]
    ldrb            r3,  [r0, #2*FDEC_STRIDE-1]
    ldrb            ip,  [r0, #3*FDEC_STRIDE-1]

    @ row 0
    add             r1,  r1,  r1,  lsl #8
    add             r1,  r1,  r1,  lsl #16
    str             r1,  [r0, #0*FDEC_STRIDE]

    @ row 1
    add             r2,  r2,  r2,  lsl #8
    add             r2,  r2,  r2,  lsl #16
    str             r2,  [r0, #1*FDEC_STRIDE]

    @ row 2
    add             r3,  r3,  r3,  lsl #8
    add             r3,  r3,  r3,  lsl #16
    str             r3,  [r0, #2*FDEC_STRIDE]

    @ row 3
    add             ip,  ip,  ip,  lsl #8
    add             ip,  ip,  ip,  lsl #16
    str             ip,  [r0, #3*FDEC_STRIDE]

    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_4x4_v_armv6( uint8_t *src )
// In: r0 = src. Scratch: r1.
// Copies the four bytes of the row above the block (src - FDEC_STRIDE)
// into each of the four rows of the block.
function predict_4x4_v_armv6
    ldr             r1,  [r0, #-FDEC_STRIDE]    @ row above the block
    str             r1,  [r0, #0*FDEC_STRIDE]
    str             r1,  [r0, #1*FDEC_STRIDE]
    str             r1,  [r0, #2*FDEC_STRIDE]
    str             r1,  [r0, #3*FDEC_STRIDE]
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_4x4_dc_armv6( uint8_t *src )
// In: r0 = src. Scratch: r1-r3, ip.
// dc = (sum of the 4 bytes above + sum of the 4 bytes to the left + 4) >> 3,
// replicated to all sixteen bytes of the block.
function predict_4x4_dc_armv6
    mov             ip,  #0
    ldr             r1,  [r0, #-FDEC_STRIDE]        @ four bytes above
    ldrb            r2,  [r0, #0*FDEC_STRIDE-1]     @ left byte, row 0
    ldrb            r3,  [r0, #1*FDEC_STRIDE-1]     @ left byte, row 1
    usad8           r1,  r1,  ip                    @ r1 = sum of the four top bytes
    add             r2,  r2,  #4                    @ rounding term
    ldrb            ip,  [r0, #2*FDEC_STRIDE-1]     @ left byte, row 2
    add             r2,  r2,  r3
    ldrb            r3,  [r0, #3*FDEC_STRIDE-1]     @ left byte, row 3
    add             r2,  r2,  ip
    add             r2,  r2,  r3                    @ r2 = left sum + 4
    add             r1,  r1,  r2
    lsr             r1,  r1,  #3                    @ r1 = dc
    add             r1,  r1,  r1,  lsl #8           @ splat dc over the word
    add             r1,  r1,  r1,  lsl #16
    str             r1,  [r0, #0*FDEC_STRIDE]
    str             r1,  [r0, #1*FDEC_STRIDE]
    str             r1,  [r0, #2*FDEC_STRIDE]
    str             r1,  [r0, #3*FDEC_STRIDE]
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_4x4_dc_top_neon( uint8_t *src )
// In: r0 = src (advanced by four rows on exit). Scratch: r1, r12, d1.
// dc = (sum of the 4 bytes above + 2) >> 2, replicated to all sixteen bytes.
function predict_4x4_dc_top_neon
    mov             r12, #FDEC_STRIDE
    sub             r1,  r0,  #FDEC_STRIDE
    vld1.32         {d1[]}, [r1,:32]        @ top word copied to both halves of d1
    vpaddl.u8       d1,  d1                 @ pair sums as u16
    vpadd.u16       d1,  d1,  d1            @ every lane = sum of the four bytes
    vrshr.u16       d1,  d1,  #2            @ rounded divide by 4
    vdup.8          d1,  d1[0]              @ splat the dc byte
    vst1.32         {d1[0]}, [r0,:32], r12
    vst1.32         {d1[0]}, [r0,:32], r12
    vst1.32         {d1[0]}, [r0,:32], r12
    vst1.32         {d1[0]}, [r0,:32], r12
    bx              lr
endfunc
|
||||
|
||||
// PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
// Applies the (1,2,1) low-pass filter to four packed bytes at a time, on two
// independent register triples:
//   return a1 = (a1+2*b1+c1+2)>>2  a2 = (a2+2*b2+c2+2)>>2
// \pb_1 must hold 0x01 in every byte. \c1 and \c2 are overwritten.
// Per byte: t = (a+c)>>1, then result = ((t+b)>>1) + ((t^b)&1), i.e. the
// average of t and b rounded up.
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
    uhadd8          \a1, \a1, \c1           @ t = (a + c) >> 1
    uhadd8          \a2, \a2, \c2
    uhadd8          \c1, \a1, \b1           @ (t + b) >> 1
    uhadd8          \c2, \a2, \b2
    eor             \a1, \a1, \b1           @ low bit of t ^ b is the dropped bit
    eor             \a2, \a2, \b2
    and             \a1, \a1, \pb_1
    and             \a2, \a2, \pb_1
    uadd8           \a1, \a1, \c1           @ add the rounding bit back in
    uadd8           \a2, \a2, \c2
.endm
|
||||
|
||||
// C prototype (predict.h): void x264_predict_4x4_ddr_armv6( uint8_t *src )
// In: r0 = src. Scratch: r1-r3, ip; r4-r6 are saved and restored.
// Packs the neighbouring bytes (row above, above-left corner, left column)
// into words that are each shifted one byte further along that edge, runs
// PRED4x4_LOWPASS over them and assembles the four output rows from the two
// filtered words by byte shifts.
// NOTE(review): the packing by shifts assumes little-endian byte order.
function predict_4x4_ddr_armv6
    ldr             r1,  [r0, # -FDEC_STRIDE]       @ four bytes above
    ldrb            r2,  [r0, # -FDEC_STRIDE-1]     @ above-left corner
    ldrb            r3,  [r0, #0*FDEC_STRIDE-1]     @ left byte, row 0
    push            {r4-r6,lr}
    add             r2,  r2,  r1,  lsl #8           @ corner in the low byte, top bytes above it
    ldrb            r4,  [r0, #1*FDEC_STRIDE-1]     @ left byte, row 1
    add             r3,  r3,  r2,  lsl #8           @ each word shifts the edge by one byte
    ldrb            r5,  [r0, #2*FDEC_STRIDE-1]     @ left byte, row 2
    ldrb            r6,  [r0, #3*FDEC_STRIDE-1]     @ left byte, row 3
    add             r4,  r4,  r3,  lsl #8
    add             r5,  r5,  r4,  lsl #8
    add             r6,  r6,  r5,  lsl #8
    ldr             ip,  =0x01010101                @ 0x01 per byte for the filter
    PRED4x4_LOWPASS r1, r2, r3, r4, r5, r6, ip      @ r1, r4 = filtered words
    str             r1,  [r0, #0*FDEC_STRIDE]       @ row 0
    lsl             r2,  r1,  #8
    lsl             r3,  r1,  #16
    lsl             r4,  r4,  #8                    @ drop the top byte of r4
    lsl             r5,  r1,  #24
    add             r2,  r2,  r4,  lsr #24          @ row 1 = row 0 shifted by 1 byte, 1 new byte
    str             r2,  [r0, #1*FDEC_STRIDE]
    add             r3,  r3,  r4,  lsr #16          @ row 2 = shifted by 2 bytes, 2 new bytes
    str             r3,  [r0, #2*FDEC_STRIDE]
    add             r5,  r5,  r4,  lsr #8           @ row 3 = shifted by 3 bytes, 3 new bytes
    str             r5,  [r0, #3*FDEC_STRIDE]
    pop             {r4-r6,pc}                      @ restore and return
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_4x4_ddl_neon( uint8_t *src )
// In: r0 = src (advanced by four rows on exit). Scratch: ip, d0-d3.
// Reads eight bytes from the row above the block, low-pass filters them
// ((t[i] + 2*t[i+1] + t[i+2] + 2) >> 2, with the last byte repeated for the
// right-hand padding) and stores rows 0-3 as the filtered vector shifted by
// 0, 1, 2 and 3 bytes.
function predict_4x4_ddl_neon
    sub             r0,  r0,  #FDEC_STRIDE
    mov             ip,  #FDEC_STRIDE
    vld1.64         {d0}, [r0], ip          @ eight top bytes; r0 is back at row 0
    vdup.8          d3,  d0[7]              @ padding = last top byte
    vext.8          d1,  d0,  d0,  #1       @ t[i+1]
    vext.8          d2,  d0,  d3,  #2       @ t[i+2], padded
    vhadd.u8        d0,  d0,  d2            @ (t[i] + t[i+2]) >> 1
    vrhadd.u8       d0,  d0,  d1            @ rounded average with t[i+1]
    vst1.32         {d0[0]}, [r0,:32], ip   @ row 0
    vext.8          d1,  d0,  d0,  #1
    vext.8          d2,  d0,  d0,  #2
    vst1.32         {d1[0]}, [r0,:32], ip   @ row 1
    vext.8          d3,  d0,  d0,  #3
    vst1.32         {d2[0]}, [r0,:32], ip   @ row 2
    vst1.32         {d3[0]}, [r0,:32], ip   @ row 3
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h):
//   void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] )
// In: r0 = src (advanced by eight rows on exit), r1 = edge.
// Scratch: r2, r3, ip, d0; r4, r5 and lr are saved and restored (lr is used
// as a temporary).
// Sums sixteen edge bytes with usad8/usada8 against zero: the byte at
// edge+7, the words at edge+8 and edge+12 (the latter with its most
// significant byte shifted out) and the words at edge+16 and edge+20.
// dc = (sum + 8) >> 4 is then stored to all 8x8 bytes.
// NOTE(review): on a little-endian target the discarded byte is edge[15],
// so the sum covers edge[7..14] and edge[16..23] - confirm.
function predict_8x8_dc_neon
    mov             ip,  #0
    ldrd            r2,  r3,  [r1, #8]
    push            {r4-r5,lr}
    ldrd            r4,  r5,  [r1, #16]
    lsl             r3,  r3,  #8            @ drop the top byte of the second word
    ldrb            lr,  [r1, #7]
    usad8           r2,  r2,  ip            @ byte sums against zero
    usad8           r3,  r3,  ip
    usada8          r2,  r4,  ip,  r2       @ accumulate the third word
    add             lr,  lr,  #8            @ rounding term
    usada8          r3,  r5,  ip,  r3       @ accumulate the fourth word
    add             r2,  r2,  lr
    mov             ip,  #FDEC_STRIDE
    add             r2,  r2,  r3
    lsr             r2,  r2,  #4            @ r2 = dc

    vdup.8          d0,  r2
.rept 8
    vst1.64         {d0}, [r0,:64], ip
.endr
    pop             {r4-r5,pc}
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_8x8_H] in predict-c.c.
// In: r0 = src (advanced by eight rows on exit), r1 = edge (r1 += 7).
// Scratch: ip, d0-d7, d16.
// Loads eight bytes from edge+7 and fills row i (i = 0..7) with byte 7-i of
// that vector, i.e. the bytes are consumed from the highest address down.
function predict_8x8_h_neon
    add             r1,  r1,  #7
    mov             ip,  #FDEC_STRIDE
    vld1.64         {d16}, [r1]
    vdup.8          d0,  d16[7]
    vdup.8          d1,  d16[6]
    vst1.64         {d0}, [r0,:64], ip      @ row 0
    vdup.8          d2,  d16[5]
    vst1.64         {d1}, [r0,:64], ip      @ row 1
    vdup.8          d3,  d16[4]
    vst1.64         {d2}, [r0,:64], ip      @ row 2
    vdup.8          d4,  d16[3]
    vst1.64         {d3}, [r0,:64], ip      @ row 3
    vdup.8          d5,  d16[2]
    vst1.64         {d4}, [r0,:64], ip      @ row 4
    vdup.8          d6,  d16[1]
    vst1.64         {d5}, [r0,:64], ip      @ row 5
    vdup.8          d7,  d16[0]
    vst1.64         {d6}, [r0,:64], ip      @ row 6
    vst1.64         {d7}, [r0,:64], ip      @ row 7
    bx              lr
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_8x8_V] in predict-c.c.
// In: r0 = src (advanced by eight rows on exit), r1 = edge (r1 += 16).
// Scratch: r12, d0.
// Copies the eight bytes at edge+16 into each of the eight rows.
function predict_8x8_v_neon
    add             r1,  r1,  #16
    mov             r12, #FDEC_STRIDE
    vld1.8          {d0}, [r1,:64]
.rept 8
    vst1.8          {d0}, [r0,:64], r12
.endr
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h):
//   void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] )
// In: r0 = src (advanced by eight rows on exit), r1 = edge (r1 += 16).
// Scratch: r12, q0-q3, q8.
// Loads the sixteen bytes t[0..15] at edge+16 and low-pass filters them:
//   f[i] = (t[i-1] + 2*t[i] + t[i+1] + 2) >> 2
// with t[16] taken as t[15]; the zero shifted in for t[-1] only affects
// f[0], which is never stored. Row i (i = 0..7) is f[i+1 .. i+8].
function predict_8x8_ddl_neon
    add             r1,  r1,  #16
    vld1.8          {d0, d1}, [r1,:128]
    vmov.i8         q3,  #0
    vrev64.8        d2,  d1                 @ d2[0] = t[15], used as right padding
    vext.8          q8,  q3,  q0,  #15      @ t[i-1], zero shifted in at the front
    vext.8          q2,  q0,  q1,  #1       @ t[i+1], t[15] repeated at the end
    vhadd.u8        q8,  q8,  q2            @ (t[i-1] + t[i+1]) >> 1
    mov             r12, #FDEC_STRIDE
    vrhadd.u8       q0,  q0,  q8            @ q0 = f[0..15]
    vext.8          d2,  d0,  d1,  #1
    vext.8          d3,  d0,  d1,  #2
    vst1.8          {d2}, [r0,:64], r12     @ row 0
    vext.8          d2,  d0,  d1,  #3
    vst1.8          {d3}, [r0,:64], r12     @ row 1
    vext.8          d3,  d0,  d1,  #4
    vst1.8          {d2}, [r0,:64], r12     @ row 2
    vext.8          d2,  d0,  d1,  #5
    vst1.8          {d3}, [r0,:64], r12     @ row 3
    vext.8          d3,  d0,  d1,  #6
    vst1.8          {d2}, [r0,:64], r12     @ row 4
    vext.8          d2,  d0,  d1,  #7
    vst1.8          {d3}, [r0,:64], r12     @ row 5
    vst1.8          {d2}, [r0,:64], r12     @ row 6
    vst1.8          {d1}, [r0,:64], r12     @ row 7 = f[8..15]
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h):
//   void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] )
// In: r0 = src, r1 = edge. Scratch: r12, q0-q3.
// On exit r0 points one row above the block (rows are written bottom-up).
// Loads 32 bytes e[0..31] from edge and low-pass filters e[8..23]:
//   f[i] = (e[i-1] + 2*e[i] + e[i+1] + 2) >> 2
// Row 7 receives f[8..15]; each row above it is the same window moved one
// byte further along, up to row 0 = f[15..22].
function predict_8x8_ddr_neon
    vld1.8          {d0-d3}, [r1,:128]
    vext.8          q2,  q0,  q1,  #7       @ e[7..22]
    vext.8          q3,  q0,  q1,  #9       @ e[9..24]

    vhadd.u8        q2,  q2,  q3            @ (e[i-1] + e[i+1]) >> 1
    vrhadd.u8       d0,  d1,  d4            @ f[8..15]
    vrhadd.u8       d1,  d2,  d5            @ f[16..23]

    add             r0,  r0,  #7*FDEC_STRIDE    @ start at the bottom row
    mov             r12, #-1*FDEC_STRIDE        @ and walk upwards

    vext.8          d2,  d0,  d1,  #1
    vst1.8          {d0}, [r0,:64], r12     @ row 7
    vext.8          d4,  d0,  d1,  #2
    vst1.8          {d2}, [r0,:64], r12     @ row 6
    vext.8          d5,  d0,  d1,  #3
    vst1.8          {d4}, [r0,:64], r12     @ row 5
    vext.8          d4,  d0,  d1,  #4
    vst1.8          {d5}, [r0,:64], r12     @ row 4
    vext.8          d5,  d0,  d1,  #5
    vst1.8          {d4}, [r0,:64], r12     @ row 3
    vext.8          d4,  d0,  d1,  #6
    vst1.8          {d5}, [r0,:64], r12     @ row 2
    vext.8          d5,  d0,  d1,  #7
    vst1.8          {d4}, [r0,:64], r12     @ row 1
    vst1.8          {d5}, [r0,:64], r12     @ row 0
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h):
//   void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] )
// In: r0 = src (advanced by eight rows on exit), r1 = edge (r1 += 16).
// Scratch: r12, q0-q3.
// With t[0..15] = the sixteen bytes at edge+16:
//   a[i] = (t[i] + t[i+1] + 1) >> 1                  (2-tap, kept in q3)
//   f[i] = (t[i-1] + 2*t[i] + t[i+1] + 2) >> 2       (3-tap, kept in q0)
// Even rows 2k are a[k .. k+7], odd rows 2k+1 are f[k+1 .. k+8].
// The two vext instructions shift in whatever q1 / q2 held on entry; those
// bytes only reach f[0], a[15] and f[15], none of which is stored.
function predict_8x8_vl_neon
    add             r1,  r1,  #16
    mov             r12, #FDEC_STRIDE

    vld1.8          {d0, d1}, [r1,:128]
    vext.8          q1,  q1,  q0,  #15      @ t[i-1] (lane 0 undefined)
    vext.8          q2,  q0,  q2,  #1       @ t[i+1] (lane 15 undefined)

    vrhadd.u8       q3,  q0,  q2            @ a[]

    vhadd.u8        q1,  q1,  q2
    vrhadd.u8       q0,  q0,  q1            @ f[]

    vext.8          d2,  d0,  d1,  #1
    vst1.8          {d6}, [r0,:64], r12     @ row 0 = a[0..7]
    vext.8          d3,  d6,  d7,  #1
    vst1.8          {d2}, [r0,:64], r12     @ row 1 = f[1..8]
    vext.8          d2,  d0,  d1,  #2
    vst1.8          {d3}, [r0,:64], r12     @ row 2 = a[1..8]
    vext.8          d3,  d6,  d7,  #2
    vst1.8          {d2}, [r0,:64], r12     @ row 3 = f[2..9]
    vext.8          d2,  d0,  d1,  #3
    vst1.8          {d3}, [r0,:64], r12     @ row 4 = a[2..9]
    vext.8          d3,  d6,  d7,  #3
    vst1.8          {d2}, [r0,:64], r12     @ row 5 = f[3..10]
    vext.8          d2,  d0,  d1,  #4
    vst1.8          {d3}, [r0,:64], r12     @ row 6 = a[3..10]
    vst1.8          {d2}, [r0,:64], r12     @ row 7 = f[4..11]
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h):
//   void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] )
// In: r0 = src (advanced by eight rows on exit), r1 = edge (r1 += 8).
// Scratch: r12, q0-q3.
// Loads sixteen bytes from edge+8 and builds two filtered vectors from
// copies rotated by one and two bytes:
//   q2 = rounded 2-tap average of each byte with its predecessor
//   q0 = 3-tap (1,2,1) low-pass
// Row 0 is the high half of the 2-tap vector and row 1 the high half of the
// 3-tap vector. The low half of the 3-tap vector is split with vuzp into
// its even bytes (d2) and odd bytes (d0); each later pair of rows shifts
// one more of those bytes in at the left of the previous pair.
function predict_8x8_vr_neon
    add             r1,  r1,  #8
    mov             r12, #FDEC_STRIDE
    vld1.8          {d4,d5}, [r1,:64]

    vext.8          q1,  q2,  q2,  #14      @ rotated by two bytes
    vext.8          q0,  q2,  q2,  #15      @ rotated by one byte

    vhadd.u8        q3,  q2,  q1
    vrhadd.u8       q2,  q2,  q0            @ 2-tap average
    vrhadd.u8       q0,  q0,  q3            @ 3-tap low-pass

    vmov            d2,  d0

    vst1.8          {d5}, [r0,:64], r12     @ row 0
    vuzp.8          d2,  d0                 @ d2 = even bytes, d0 = odd bytes
    vst1.8          {d1}, [r0,:64], r12     @ row 1
    vext.8          d6,  d0,  d5,  #7
    vext.8          d3,  d2,  d1,  #7
    vst1.8          {d6}, [r0,:64], r12     @ row 2
    vst1.8          {d3}, [r0,:64], r12     @ row 3
    vext.8          d6,  d0,  d5,  #6
    vext.8          d3,  d2,  d1,  #6
    vst1.8          {d6}, [r0,:64], r12     @ row 4
    vst1.8          {d3}, [r0,:64], r12     @ row 5
    vext.8          d6,  d0,  d5,  #5
    vext.8          d3,  d2,  d1,  #5
    vst1.8          {d6}, [r0,:64], r12     @ row 6
    vst1.8          {d3}, [r0,:64], r12     @ row 7
    bx              lr
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_8x8_HD] in predict-c.c.
// In: r0 = src (advanced by eight rows on exit), r1 = edge (r1 += 7).
// Scratch: r12, q0-q3, q8.
// Loads sixteen bytes from edge+7 and computes, against copies rotated by
// one and two bytes, a rounded 2-tap average (q8) and a 3-tap (1,2,1)
// low-pass (q0). The low halves of both are interleaved byte-wise with vzip
// into d16:d0; together with d1 (high half of the 3-tap result) this forms
// one long vector from which each row takes an 8-byte window, moving two
// bytes towards the start per row (row 0 highest, row 7 = d16).
function predict_8x8_hd_neon
    mov             r12, #FDEC_STRIDE
    add             r1,  r1,  #7

    vld1.8          {d2,d3}, [r1]
    vext.8          q3,  q1,  q1,  #1       @ rotated by one byte
    vext.8          q2,  q1,  q1,  #2       @ rotated by two bytes

    vrhadd.u8       q8,  q1,  q3            @ 2-tap average

    vhadd.u8        q1,  q1,  q2
    vrhadd.u8       q0,  q1,  q3            @ 3-tap low-pass

    vzip.8          d16, d0                 @ interleave 2-tap / 3-tap low halves

    vext.8          d2,  d0,  d1,  #6
    vext.8          d3,  d0,  d1,  #4
    vst1.8          {d2}, [r0,:64], r12     @ row 0
    vext.8          d2,  d0,  d1,  #2
    vst1.8          {d3}, [r0,:64], r12     @ row 1
    vst1.8          {d2}, [r0,:64], r12     @ row 2
    vext.8          d2,  d16, d0,  #6
    vst1.8          {d0}, [r0,:64], r12     @ row 3
    vext.8          d3,  d16, d0,  #4
    vst1.8          {d2}, [r0,:64], r12     @ row 4
    vext.8          d2,  d16, d0,  #2
    vst1.8          {d3}, [r0,:64], r12     @ row 5
    vst1.8          {d2}, [r0,:64], r12     @ row 6
    vst1.8          {d16}, [r0,:64], r12    @ row 7

    bx              lr
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_8x8_HU] in predict-c.c.
// In: r0 = src (advanced by seven rows on exit; the final store does not
// post-increment), r1 = edge (r1 += 7). Scratch: r12, q0-q3, q8.
// Loads eight bytes from edge+7, reverses them and pads on the right with
// copies of the byte that was at edge+7. A rounded 2-tap average (d0) and a
// 3-tap (1,2,1) low-pass (d1) of the reversed vector are interleaved with
// vzip into q0; q1 repeats the last 16-bit pair as padding. Row i is the
// 8-byte window starting 2*i bytes into q0:q1.
function predict_8x8_hu_neon
    mov             r12, #FDEC_STRIDE
    add             r1,  r1,  #7
    vld1.8          {d7}, [r1]
    vdup.8          d6,  d7[0]              @ padding byte
    vrev64.8        d7,  d7                 @ reverse the byte order

    vext.8          d4,  d7,  d6,  #2       @ shifted by two, padded
    vext.8          d2,  d7,  d6,  #1       @ shifted by one, padded

    vhadd.u8        d16, d7,  d4
    vrhadd.u8       d0,  d2,  d7            @ 2-tap average
    vrhadd.u8       d1,  d16, d2            @ 3-tap low-pass

    vzip.8          d0,  d1                 @ interleave into q0

    vdup.16         q1,  d1[3]              @ repeat the final pair

    vext.8          q2,  q0,  q1,  #2
    vext.8          q3,  q0,  q1,  #4
    vext.8          q8,  q0,  q1,  #6
    vst1.8          {d0}, [r0,:64], r12     @ row 0
    vst1.8          {d4}, [r0,:64], r12     @ row 1
    vst1.8          {d6}, [r0,:64], r12     @ row 2
    vst1.8          {d16}, [r0,:64], r12    @ row 3

    vst1.8          {d1}, [r0,:64], r12     @ row 4
    vst1.8          {d5}, [r0,:64], r12     @ row 5
    vst1.8          {d7}, [r0,:64], r12     @ row 6
    vst1.8          {d17}, [r0,:64]         @ row 7
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x8c_dc_top_neon( uint8_t *src )
// In: r0 = src. Scratch: r1, r2, d0, d1.
// dc0 = (sum of top[0..3] + 2) >> 2 and dc1 = (sum of top[4..7] + 2) >> 2.
// Both d0 and d1 are set to dc0 in bytes 0-3 and dc1 in bytes 4-7; the
// shared tail pred8x8_dc_end (in predict_8x8c_dc_neon) stores them to all
// eight rows and returns.
function predict_8x8c_dc_top_neon
    sub             r2,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE       @ stride expected by pred8x8_dc_end
    vld1.8          {d0}, [r2,:64]          @ eight bytes above the block
    vpaddl.u8       d0,  d0
    vpadd.u16       d0,  d0,  d0            @ lane 0 = left-half sum, lane 1 = right-half sum
    vrshrn.u16      d0,  q0,  #2            @ rounded divide by 4, narrowed to bytes
    vdup.8          d1,  d0[1]              @ dc1
    vdup.8          d0,  d0[0]              @ dc0
    vtrn.32         d0,  d1                 @ d0 = d1 = dc0 x4, dc1 x4
    b               pred8x8_dc_end
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x8c_dc_left_neon( uint8_t *src )
// In: r0 = src. Scratch: r1, r2, d0, d1.
// dc0 = (sum of the left bytes of rows 0-3 + 2) >> 2 fills d0 and
// dc1 = (sum of the left bytes of rows 4-7 + 2) >> 2 fills d1. The shared
// tail pred8x8_dc_end stores d0 to rows 0-3 and d1 to rows 4-7.
function predict_8x8c_dc_left_neon
    mov             r1,  #FDEC_STRIDE
    sub             r2,  r0,  #1
    ldcol.8         d0,  r2,  r1            @ left column, rows 0-7
    vpaddl.u8       d0,  d0
    vpadd.u16       d0,  d0,  d0            @ lane 0 = rows 0-3, lane 1 = rows 4-7
    vrshrn.u16      d0,  q0,  #2
    vdup.8          d1,  d0[1]              @ dc1 (bottom half)
    vdup.8          d0,  d0[0]              @ dc0 (top half)
    b               pred8x8_dc_end
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x8c_dc_neon( uint8_t *src )
// In: r0 = src. Scratch: r1, r2, q0-q2.
// With s0 = sum top[0..3], s1 = sum top[4..7], s2 = sum left rows 0-3 and
// s3 = sum left rows 4-7, the four 4x4 quadrants are filled with
//   top-left     (s0 + s2 + 4) >> 3
//   top-right    (s1 + 2) >> 2
//   bottom-left  (s3 + 2) >> 2
//   bottom-right (s1 + s3 + 4) >> 3
//
// pred8x8_dc_end is a shared tail also entered from the dc_top / dc_left
// variants. It expects r0 = src, r1 = FDEC_STRIDE, d0 = pattern for rows
// 0-3, d1 = pattern for rows 4-7; it stores eight rows and returns.
function predict_8x8c_dc_neon
    sub             r2,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE
    vld1.8          {d0}, [r2,:64]          @ eight bytes above
    sub             r2,  r0,  #1
    ldcol.8         d1,  r2,  r1            @ eight bytes to the left
    vtrn.32         d0,  d1                 @ d0 = top0-3 | left0-3, d1 = top4-7 | left4-7
    vpaddl.u8       q0,  q0
    vpadd.u16       d0,  d0,  d1            @ d0 = s0, s2, s1, s3
    vpadd.u16       d1,  d0,  d0            @ d1 = s0+s2, s1+s3 (repeated)
    vrshrn.u16      d2,  q0,  #3            @ all sums, rounded >> 3
    vrshrn.u16      d3,  q0,  #2            @ all sums, rounded >> 2
    vdup.8          d0,  d2[4]              @ top-left
    vdup.8          d1,  d3[3]              @ bottom-left
    vdup.8          d4,  d3[2]              @ top-right
    vdup.8          d5,  d2[5]              @ bottom-right
    vtrn.32         q0,  q2                 @ d0 = TL | TR, d1 = BL | BR
pred8x8_dc_end:
    add             r2,  r0,  r1,  lsl #2   @ r2 = row 4
.rept 4
    vst1.8          {d0}, [r0,:64], r1
    vst1.8          {d1}, [r2,:64], r1
.endr
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x8c_h_neon( uint8_t *src )
// In: r0 = src (advanced by eight rows on exit). Scratch: r1, ip, d0, d2.
// Each of the eight rows is filled with the byte immediately left of it.
// Two rows are handled per iteration of the unrolled loop.
function predict_8x8c_h_neon
    sub             r1,  r0,  #1            @ r1 walks the left column
    mov             ip,  #FDEC_STRIDE
.rept 4
    vld1.8          {d0[]}, [r1], ip        @ splat left byte of row 2k
    vld1.8          {d2[]}, [r1], ip        @ splat left byte of row 2k+1
    vst1.64         {d0}, [r0,:64], ip
    vst1.64         {d2}, [r0,:64], ip
.endr
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x8c_v_neon( uint8_t *src )
// In: r0 = src (advanced by eight rows on exit). Scratch: ip, d0.
// Copies the eight bytes of the row above the block into all eight rows.
function predict_8x8c_v_neon
    sub             r0,  r0,  #FDEC_STRIDE
    mov             ip,  #FDEC_STRIDE
    vld1.64         {d0}, [r0,:64], ip      @ row above; r0 is back at row 0
.rept 8
    vst1.64         {d0}, [r0,:64], ip
.endr
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x8c_p_neon( uint8_t *src )
// In: r0 = src (advanced by eight rows on exit). Scratch: r1-r3, q0-q3, q8.
// Planar prediction of an 8x8 block. With top[] the row above (top[-1] is
// the corner) and left[] the column to the left (left[-1] is the corner):
//   H = sum over k = 1..4 of k * (top[3+k]  - top[3-k])
//   V = sum over k = 1..4 of k * (left[3+k] - left[3-k])
//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
//   i00 = 16 * (top[7] + left[7] + 1) - 3 * (b + c)
//   pixel(x, y) = saturate_u8( (i00 + b*x + c*y) >> 5 )
function predict_8x8c_p_neon
    sub             r3,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE
    add             r2,  r3,  #4
    sub             r3,  r3,  #1            @ r3 = corner above-left
    vld1.32         {d0[0]}, [r3]           @ top[-1..2]
    vld1.32         {d2[0]}, [r2,:32], r1   @ top[4..7]
    ldcol.8         d0,  r3,  r1,  4,  hi=1 @ d0 lanes 4-7 = left[-1..2]
    add             r3,  r3,  r1            @ skip left[3]
    ldcol.8         d3,  r3,  r1,  4        @ d3 lanes 0-3 = left[4..7]
    vaddl.u8        q8,  d2,  d3            @ d16 = top[4..7] + left[4..7]
    vrev32.8        d0,  d0                 @ top[2..-1] | left[2..-1]
    vtrn.32         d2,  d3                 @ d2 = top[4..7] | left[4..7]
    vsubl.u8        q2,  d2,  d0            @ d4 = top diffs, d5 = left diffs
    movrel          r3,  p16weight
    vld1.16         {q0}, [r3,:128]         @ q0 = 1..8
    vmul.s16        d4,  d4,  d0            @ weight the differences by 1..4
    vmul.s16        d5,  d5,  d0
    vpadd.i16       d4,  d4,  d5
    vpaddl.s16      d4,  d4                 @ d4 = H, V (s32)
    vshl.i32        d5,  d4,  #4
    vadd.s32        d4,  d4,  d5            @ d4 = 17*H, 17*V
    vrshrn.s32      d4,  q2,  #5            @ d4[0] = b, d4[1] = c (s16)
    mov             r3,  #0
    vtrn.16         d4,  d5                 @ d4[0] = b, d5[0] = c
    vadd.i16        d2,  d4,  d5            @ d2[0] = b + c
    vshl.i16        d3,  d2,  #2
    vrev64.16       d16, d16                @ d16[0] = top[7] + left[7]
    vsub.i16        d3,  d3,  d2            @ d3[0] = 3 * (b + c)
    vadd.i16        d16, d16, d0            @ d16[0] += 1 (d0[0] is the weight 1)
    vshl.i16        d2,  d16, #4            @ d2[0] = a + 16
    vsub.i16        d2,  d2,  d3            @ d2[0] = i00
    vext.16         q0,  q0,  q0,  #7
    vmov.16         d0[0], r3               @ q0 = 0..7
    vmul.i16        q0,  q0,  d4[0]         @ q0 = b * x
    vdup.16         q1,  d2[0]
    vdup.16         q3,  d5[0]              @ q3 = c in every lane
    vadd.i16        q1,  q1,  q0            @ q1 = first row before the shift
    mov             r3,  #8                 @ row counter
1:
    vqshrun.s16     d0,  q1,  #5            @ >> 5 with unsigned saturation
    vadd.i16        q1,  q1,  q3            @ next row: add c
    vst1.8          {d0}, [r0,:64], r1
    subs            r3,  r3,  #1
    bne             1b
    bx              lr
endfunc
|
||||
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x16c_dc_top_neon( uint8_t *src )
// In: r0 = src. Scratch: r1, r2, d0, d1.
// dc0 = (sum of top[0..3] + 2) >> 2 and dc1 = (sum of top[4..7] + 2) >> 2.
// Every one of the sixteen rows is written as dc0 x4, dc1 x4.
function predict_8x16c_dc_top_neon
    sub             r2,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE
    vld1.8          {d0}, [r2,:64]          @ eight bytes above the block
    vpaddl.u8       d0,  d0
    vpadd.u16       d0,  d0,  d0            @ lane 0 = left-half sum, lane 1 = right-half sum
    vrshrn.u16      d0,  q0,  #2
    vdup.8          d1,  d0[1]
    vdup.8          d0,  d0[0]
    vtrn.32         d0,  d1                 @ d0 = d1 = dc0 x4, dc1 x4

    add             r2,  r0,  r1,  lsl #2   @ r2 = row 4
.rept 4
    vst1.8          {d0}, [r0,:64], r1      @ rows 0-3
    vst1.8          {d1}, [r2,:64], r1      @ rows 4-7
.endr
    add             r2,  r2,  r1,  lsl #2   @ r2 = row 12
    add             r0,  r0,  r1,  lsl #2   @ r0 = row 8
.rept 4
    vst1.8          {d0}, [r0,:64], r1      @ rows 8-11
    vst1.8          {d1}, [r2,:64], r1      @ rows 12-15
.endr
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x16c_h_neon( uint8_t *src )
// In: r0 = src (advanced by sixteen rows on exit). Scratch: r1, ip, d0, d2.
// Each of the sixteen rows is filled with the byte immediately left of it.
// Two rows are handled per iteration of the unrolled loop.
function predict_8x16c_h_neon
    sub             r1,  r0,  #1            @ r1 walks the left column
    mov             ip,  #FDEC_STRIDE
.rept 8
    vld1.8          {d0[]}, [r1], ip        @ splat left byte of row 2k
    vld1.8          {d2[]}, [r1], ip        @ splat left byte of row 2k+1
    vst1.64         {d0}, [r0,:64], ip
    vst1.64         {d2}, [r0,:64], ip
.endr
    bx              lr
endfunc
|
||||
|
||||
// C prototype (predict.h): void x264_predict_8x16c_p_neon( uint8_t *src )
// In: r0 = src (advanced by sixteen rows on exit). Scratch: r1-r3, q0-q3, q8.
// Planar prediction of an 8-wide, 16-high block. With top[] the row above
// and left[] the column to the left (index -1 is the corner):
//   H = sum over k = 1..4 of k * (top[3+k]  - top[3-k])
//   V = sum over k = 1..8 of k * (left[7+k] - left[7-k])
//   b = (17*H + 16) >> 5,  c = (5*V + 32) >> 6
//   i00 = 16 * (top[7] + left[15] + 1) - 3*b - 7*c
//   pixel(x, y) = saturate_u8( (i00 + b*x + c*y) >> 5 )
function predict_8x16c_p_neon
    sub             r3,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE
    add             r2,  r3,  #4
    sub             r3,  r3,  #1            @ r3 = corner above-left
    vld1.32         {d0[0]}, [r3]           @ top[-1..2]
    vld1.32         {d2[0]}, [r2,:32], r1   @ top[4..7]
    ldcol.8         d1,  r3,  r1            @ left[-1..6]
    add             r3,  r3,  r1            @ skip left[7]
    ldcol.8         d3,  r3,  r1            @ left[8..15]
    vrev64.32       d16, d3                 @ left[12..15] in the low word
    vaddl.u8        q8,  d2,  d16           @ d16 = top[4..7] + left[12..15]
    vrev32.8        d0,  d0                 @ top[2..-1]
    vsubl.u8        q2,  d2,  d0            @ d4 = horizontal differences
    vrev64.8        d1,  d1                 @ left[6..-1]
    vsubl.u8        q3,  d3,  d1            @ q3 = vertical differences
    movrel          r3,  p16weight
    vld1.16         {q0}, [r3,:128]         @ q0 = 1..8
    vmul.s16        d4,  d4,  d0
    vmul.s16        q3,  q3,  q0
    vpadd.i16       d4,  d4,  d5
    vpadd.i16       d6,  d6,  d7
    vpaddl.s16      d4,  d4                 @ d4[0] = H
    vpaddl.s16      d6,  d6
    vpadd.s32       d6,  d6                 @ d6[0] = V
    vshl.i32        d5,  d4,  #4
    vadd.s32        d4,  d4,  d5            @ d4[0] = 17*H
    vshl.i32        d7,  d6,  #2
    vrshrn.s32      d4,  q2,  #5            @ d4[0] = b
    vadd.s32        d6,  d6,  d7            @ d6[0] = 5*V
    vrshrn.s32      d6,  q3,  #6            @ d6[0] = c
    mov             r3,  #0
    vshl.i16        d3,  d4,  #2
    vsub.i16        d3,  d3,  d4            @ d3[0] = 3 * b
    vshl.i16        d2,  d6,  #3
    vadd.i16        d3,  d3,  d2            @ d3[0] = 3 * b + 8 * c
    vsub.i16        d3,  d3,  d6            @ d3[0] = 3 * b + 7 * c
    vrev64.16       d16, d16
    vadd.i16        d16, d16, d0            @ d16[0] = top[7] + left[15] + 1
    vshl.i16        d2,  d16, #4            @ d2[0] = a + 16
    vsub.i16        d2,  d2,  d3            @ d2[0] = i00
    vext.16         q0,  q0,  q0,  #7
    vmov.16         d0[0], r3               @ q0 = 0..7
    vmul.i16        q0,  q0,  d4[0]         @ q0 = b * x
    vdup.16         q1,  d2[0]
    vdup.16         q3,  d6[0]              @ q3 = c in every lane
    vadd.i16        q1,  q1,  q0            @ q1 = first row before the shift
    mov             r3,  #16                @ row counter
1:
    vqshrun.s16     d0,  q1,  #5            @ >> 5 with unsigned saturation
    vadd.i16        q1,  q1,  q3            @ next row: add c
    vst1.8          {d0}, [r0,:64], r1
    subs            r3,  r3,  #1
    bne             1b
    bx              lr
endfunc
|
||||
|
||||
|
||||
// Installed as pf[I_PRED_16x16_DC_TOP] in predict-c.c.
// In: r0 = src. Scratch: r1, r2, q0.
// dc = (sum of the 16 bytes above the block + 8) >> 4. The shared tail
// pred16x16_dc_end (in predict_16x16_dc_neon) stores q0 to all sixteen rows
// and returns.
function predict_16x16_dc_top_neon
    sub             r2,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE       @ stride expected by pred16x16_dc_end
    vld1.8          {q0}, [r2,:128]
    add16x8         q0,  d0,  d1,  d0,  d1  @ every u16 lane of d0 = sum of the 16 bytes
    vrshrn.u16      d0,  q0,  #4
    vdup.8          q0,  d0[0]
    b               pred16x16_dc_end
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_16x16_DC_LEFT] in predict-c.c.
// In: r0 = src. Scratch: r1, r2, q0.
// dc = (sum of the 16 bytes left of the block + 8) >> 4. The shared tail
// pred16x16_dc_end stores q0 to all sixteen rows and returns.
function predict_16x16_dc_left_neon
    mov             r1,  #FDEC_STRIDE
    sub             r2,  r0,  #1
    ldcol.8         d0,  r2,  r1            @ left column, rows 0-7
    ldcol.8         d1,  r2,  r1            @ left column, rows 8-15 (r2 carried on)
    add16x8         q0,  d0,  d1,  d0,  d1  @ every u16 lane of d0 = sum of the 16 bytes
    vrshrn.u16      d0,  q0,  #4
    vdup.8          q0,  d0[0]
    b               pred16x16_dc_end
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_16x16_DC] in predict-c.c.
// In: r0 = src. Scratch: r1-r3, ip, q0.
// dc = (sum of the 16 bytes above + sum of the 16 bytes to the left + 16)
// >> 5. The top row is summed with NEON while the left column is summed
// with interleaved scalar byte loads.
//
// pred16x16_dc_end is a shared tail also entered from the dc_top / dc_left
// variants. It expects r0 = src, r1 = FDEC_STRIDE and q0 = row pattern; it
// stores sixteen rows and returns.
function predict_16x16_dc_neon
    sub             r3,  r0,  #FDEC_STRIDE
    sub             r0,  r0,  #1            @ r0 walks the left column
    vld1.64         {d0-d1}, [r3,:128]      @ sixteen bytes above
    ldrb            ip,  [r0], #FDEC_STRIDE @ ip accumulates the left sum
    vaddl.u8        q0,  d0,  d1
    ldrb            r1,  [r0], #FDEC_STRIDE
    vadd.u16        d0,  d0,  d1
    vpadd.u16       d0,  d0,  d0
    vpadd.u16       d0,  d0,  d0            @ every u16 lane of d0 = top sum
.rept 4
    ldrb            r2,  [r0], #FDEC_STRIDE
    add             ip,  ip,  r1
    ldrb            r3,  [r0], #FDEC_STRIDE
    add             ip,  ip,  r2
    ldrb            r1,  [r0], #FDEC_STRIDE
    add             ip,  ip,  r3
.endr
    ldrb            r2,  [r0], #FDEC_STRIDE
    add             ip,  ip,  r1
    ldrb            r3,  [r0], #FDEC_STRIDE
    add             ip,  ip,  r2

    sub             r0,  r0,  #FDEC_STRIDE*16   @ undo the sixteen row steps
    add             ip,  ip,  r3            @ ip = left sum
    vdup.16         d1,  ip
    vadd.u16        d0,  d0,  d1            @ top sum + left sum
    mov             r1,  #FDEC_STRIDE
    add             r0,  r0,  #1            @ r0 = src again
    vrshr.u16       d0,  d0,  #5            @ rounded divide by 32
    vdup.8          q0,  d0[0]
pred16x16_dc_end:
.rept 16
    vst1.64         {d0-d1}, [r0,:128], r1
.endr
    bx              lr
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_16x16_H] in predict-c.c.
// In: r0 = src (advanced by sixteen rows on exit). Scratch: r1, ip, q0, q1.
// Each of the sixteen rows is filled with the byte immediately left of it.
// Two rows are handled per iteration of the unrolled loop.
function predict_16x16_h_neon
    sub             r1,  r0,  #1            @ r1 walks the left column
    mov             ip,  #FDEC_STRIDE
.rept 8
    vld1.8          {d0[]}, [r1], ip        @ splat left byte of row 2k
    vmov            d1,  d0                 @ widen the splat to 16 bytes
    vld1.8          {d2[]}, [r1], ip        @ splat left byte of row 2k+1
    vmov            d3,  d2
    vst1.64         {d0-d1}, [r0,:128], ip
    vst1.64         {d2-d3}, [r0,:128], ip
.endr
    bx              lr
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_16x16_V] in predict-c.c.
// In: r0 = src (advanced by sixteen rows on exit). Scratch: ip, q0.
// Copies the sixteen bytes of the row above the block into all sixteen rows.
function predict_16x16_v_neon
    sub             r0,  r0,  #FDEC_STRIDE
    mov             ip,  #FDEC_STRIDE
    vld1.64         {d0-d1}, [r0,:128], ip  @ row above; r0 is back at row 0
.rept 16
    vst1.64         {d0-d1}, [r0,:128], ip
.endr
    bx              lr
endfunc
|
||||
|
||||
// Installed as pf[I_PRED_16x16_P] in predict-c.c.
// In: r0 = src (advanced by sixteen rows on exit). Scratch: r1-r3, q0-q3, q8.
// Planar prediction of a 16x16 block. With top[] the row above and left[]
// the column to the left (index -1 is the corner):
//   H = sum over k = 1..8 of k * (top[7+k]  - top[7-k])
//   V = sum over k = 1..8 of k * (left[7+k] - left[7-k])
//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
//   i00 = 16 * (top[15] + left[15] + 1) - 7 * (b + c)
//   pixel(x, y) = saturate_u8( (i00 + b*x + c*y) >> 5 )
// Each row is produced as two 8-pixel halves: the accumulator is advanced
// by 8*b between the halves and by c - 8*b to reach the next row.
function predict_16x16_p_neon
    sub             r3,  r0,  #FDEC_STRIDE
    mov             r1,  #FDEC_STRIDE
    add             r2,  r3,  #8
    sub             r3,  r3,  #1            @ r3 = corner above-left
    vld1.8          {d0}, [r3]              @ top[-1..6]
    vld1.8          {d2}, [r2,:64], r1      @ top[8..15]
    ldcol.8         d1,  r3,  r1            @ left[-1..6]
    add             r3,  r3,  r1            @ skip left[7]
    ldcol.8         d3,  r3,  r1            @ left[8..15]
    vrev64.8        q0,  q0                 @ top[6..-1], left[6..-1]
    vaddl.u8        q8,  d2,  d3            @ top[8..15] + left[8..15]
    vsubl.u8        q2,  d2,  d0            @ horizontal differences
    vsubl.u8        q3,  d3,  d1            @ vertical differences
    movrel          r3,  p16weight
    vld1.8          {q0}, [r3,:128]         @ q0 = 1..8 (16-bit lanes)
    vmul.s16        q2,  q2,  q0
    vmul.s16        q3,  q3,  q0
    vadd.i16        d4,  d4,  d5
    vadd.i16        d5,  d6,  d7
    vpadd.i16       d4,  d4,  d5
    vpadd.i16       d4,  d4,  d4            @ d4 = H, V, H, V
    vshll.s16       q3,  d4,  #2
    vaddw.s16       q2,  q3,  d4            @ q2 = 5*H, 5*V (s32)
    vrshrn.s32      d4,  q2,  #6            @ d4[0] = b, d4[1] = c
    mov             r3,  #0
    vtrn.16         d4,  d5                 @ d4[0] = b, d5[0] = c
    vadd.i16        d2,  d4,  d5            @ d2[0] = b + c
    vshl.i16        d3,  d2,  #3
    vrev64.16       d16, d17                @ d16[0] = top[15] + left[15]
    vsub.i16        d3,  d3,  d2            @ d3[0] = 7 * (b + c)
    vadd.i16        d16, d16, d0            @ d16[0] += 1 (d0[0] is the weight 1)
    vshl.i16        d2,  d16, #4            @ d2[0] = a + 16
    vsub.i16        d2,  d2,  d3            @ d2[0] = i00
    vshl.i16        d3,  d4,  #4            @ d3[0] = 16 * b
    vext.16         q0,  q0,  q0,  #7
    vsub.i16        d6,  d5,  d3            @ d6[0] = c - 16 * b
    vmov.16         d0[0], r3               @ q0 = 0..7
    vmul.i16        q0,  q0,  d4[0]         @ q0 = b * x
    vdup.16         q1,  d2[0]
    vdup.16         q2,  d4[0]
    vdup.16         q3,  d6[0]
    vshl.i16        q2,  q2,  #3            @ q2 = 8 * b
    vadd.i16        q1,  q1,  q0            @ q1 = first half-row before the shift
    vadd.i16        q3,  q3,  q2            @ q3 = c - 8 * b
    mov             r3,  #16                @ row counter
1:
    vqshrun.s16     d0,  q1,  #5            @ pixels 0-7
    vadd.i16        q1,  q1,  q2            @ move to x = 8..15
    vqshrun.s16     d1,  q1,  #5            @ pixels 8-15
    vadd.i16        q1,  q1,  q3            @ back to x = 0..7 of the next row
    vst1.8          {q0}, [r0,:128], r1
    subs            r3,  r3,  #1
    bne             1b
    bx              lr
endfunc
|
||||
108
common/arm/predict-c.c
Normal file
108
common/arm/predict-c.c
Normal file
@@ -0,0 +1,108 @@
|
||||
/*****************************************************************************
|
||||
* predict.c: arm intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "predict.h"
|
||||
#include "pixel.h"
|
||||
|
||||
/* Install the ARM 4x4 intra predictors into pf[].
 *   cpu: bit mask of X264_CPU_* capability flags
 *   pf:  predictor table indexed by I_PRED_4x4_*; entries are overwritten
 * Nothing is installed without X264_CPU_ARMV6 or in HIGH_BIT_DEPTH builds.
 * With ARMv6 the H, V, DC and DDR entries are replaced; the DC_TOP and DDL
 * entries are replaced only if X264_CPU_NEON is also set. */
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] )
{
    if( cpu & X264_CPU_ARMV6 )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
        pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
        pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
        pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;

        if( cpu & X264_CPU_NEON )
        {
            pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
            pf[I_PRED_4x4_DDL]    = x264_predict_4x4_ddl_neon;
        }
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
|
||||
/* Install the NEON 8x8 chroma intra predictors into pf[].
 *   cpu: bit mask of X264_CPU_* capability flags
 *   pf:  predictor table indexed by I_PRED_CHROMA_*
 * Nothing is installed without X264_CPU_NEON or in HIGH_BIT_DEPTH builds. */
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
        pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
        pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
        pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_neon;
        pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_neon;
        pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
|
||||
/* Install the NEON 8x16 chroma intra predictors into pf[].
 *   cpu: bit mask of X264_CPU_* capability flags
 *   pf:  predictor table indexed by I_PRED_CHROMA_*
 * Only DC_TOP, H and P are replaced. Nothing is installed without
 * X264_CPU_NEON or in HIGH_BIT_DEPTH builds. */
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        /* The other functions weren't faster than C (gcc 4.7.3) on Cortex A8 and A9. */
        pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
        pf[I_PRED_CHROMA_H]      = x264_predict_8x16c_h_neon;
        pf[I_PRED_CHROMA_P]      = x264_predict_8x16c_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
|
||||
/* Install the NEON 8x8 luma intra predictors into pf[].
 *   cpu:            bit mask of X264_CPU_* capability flags
 *   pf:             predictor table indexed by I_PRED_8x8_*
 *   predict_filter: not touched by this function
 * Nothing is installed without X264_CPU_NEON or in HIGH_BIT_DEPTH builds. */
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
        pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
        pf[I_PRED_8x8_VL]  = x264_predict_8x8_vl_neon;
        pf[I_PRED_8x8_VR]  = x264_predict_8x8_vr_neon;
        pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
        pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
        pf[I_PRED_8x8_HD]  = x264_predict_8x8_hd_neon;
        pf[I_PRED_8x8_HU]  = x264_predict_8x8_hu_neon;
        pf[I_PRED_8x8_V]   = x264_predict_8x8_v_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
|
||||
/* Install the NEON 16x16 intra predictors into pf[].
 *   cpu: bit mask of X264_CPU_* capability flags
 *   pf:  predictor table indexed by I_PRED_16x16_*
 * Nothing is installed without X264_CPU_NEON or in HIGH_BIT_DEPTH builds. */
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] )
{
    if( cpu & X264_CPU_NEON )
    {
#if !HIGH_BIT_DEPTH
        pf[I_PRED_16x16_DC ]     = x264_predict_16x16_dc_neon;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_neon;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_neon;
        pf[I_PRED_16x16_H ]      = x264_predict_16x16_h_neon;
        pf[I_PRED_16x16_V ]      = x264_predict_16x16_v_neon;
        pf[I_PRED_16x16_P ]      = x264_predict_16x16_p_neon;
#endif // !HIGH_BIT_DEPTH
    }
}
|
||||
105
common/arm/predict.h
Normal file
105
common/arm/predict.h
Normal file
@@ -0,0 +1,105 @@
|
||||
/*****************************************************************************
|
||||
* predict.h: arm intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_PREDICT_H
|
||||
#define X264_ARM_PREDICT_H
|
||||
|
||||
#define x264_predict_4x4_dc_armv6 x264_template(predict_4x4_dc_armv6)
|
||||
void x264_predict_4x4_dc_armv6( uint8_t *src );
|
||||
#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
|
||||
void x264_predict_4x4_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_4x4_v_armv6 x264_template(predict_4x4_v_armv6)
|
||||
void x264_predict_4x4_v_armv6( uint8_t *src );
|
||||
#define x264_predict_4x4_h_armv6 x264_template(predict_4x4_h_armv6)
|
||||
void x264_predict_4x4_h_armv6( uint8_t *src );
|
||||
#define x264_predict_4x4_ddr_armv6 x264_template(predict_4x4_ddr_armv6)
|
||||
void x264_predict_4x4_ddr_armv6( uint8_t *src );
|
||||
#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
|
||||
void x264_predict_4x4_ddl_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
|
||||
void x264_predict_8x8c_dc_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
|
||||
void x264_predict_8x8c_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
|
||||
void x264_predict_8x8c_dc_left_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
|
||||
void x264_predict_8x8c_h_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_v_neon x264_template(predict_8x8c_v_neon)
|
||||
void x264_predict_8x8c_v_neon( uint8_t *src );
|
||||
#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
|
||||
void x264_predict_8x8c_p_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
|
||||
void x264_predict_8x16c_h_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
|
||||
void x264_predict_8x16c_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
|
||||
void x264_predict_8x16c_p_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
|
||||
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
|
||||
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
|
||||
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
|
||||
void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
|
||||
void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
|
||||
void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
|
||||
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
|
||||
void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
|
||||
void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
|
||||
|
||||
#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
|
||||
void x264_predict_16x16_dc_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
|
||||
void x264_predict_16x16_dc_top_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
|
||||
void x264_predict_16x16_dc_left_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
|
||||
void x264_predict_16x16_h_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
|
||||
void x264_predict_16x16_v_neon( uint8_t *src );
|
||||
#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
|
||||
void x264_predict_16x16_p_neon( uint8_t *src );
|
||||
|
||||
#define x264_predict_4x4_init_arm x264_template(predict_4x4_init_arm)
|
||||
void x264_predict_4x4_init_arm( uint32_t cpu, x264_predict_t pf[12] );
|
||||
#define x264_predict_8x8_init_arm x264_template(predict_8x8_init_arm)
|
||||
void x264_predict_8x8_init_arm( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
|
||||
#define x264_predict_8x8c_init_arm x264_template(predict_8x8c_init_arm)
|
||||
void x264_predict_8x8c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_8x16c_init_arm x264_template(predict_8x16c_init_arm)
|
||||
void x264_predict_8x16c_init_arm( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_16x16_init_arm x264_template(predict_16x16_init_arm)
|
||||
void x264_predict_16x16_init_arm( uint32_t cpu, x264_predict_t pf[7] );
|
||||
|
||||
#endif
|
||||
574
common/arm/quant-a.S
Normal file
574
common/arm/quant-a.S
Normal file
@@ -0,0 +1,574 @@
|
||||
/****************************************************************************
|
||||
* quant-a.S: arm quantization and level-run
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2009-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
* Janne Grunau <janne-x264@jannau.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "asm.S"
|
||||
|
||||
// Per-byte bit weights 1,2,4,...,128, given twice so that they fill a whole
// q register. coeff_last64_neon ANDs them with per-coefficient 0x00/0xff
// flags and pairwise-adds the bytes to build a bitmap with one bit per
// coefficient (first coefficient of each group of eight in bit 0).
const pmovmskb_byte, align=4
|
||||
.byte 1,2,4,8,16,32,64,128
|
||||
.byte 1,2,4,8,16,32,64,128
|
||||
endconst
|
||||
|
||||
// Two-bit fields 3,12,48,192 repeated for every group of four bytes.
// decimate_score15/16_neon AND them with per-coefficient 0x00/0xff flags;
// two rounds of pairwise adds then pack four coefficients into each byte,
// two bits per coefficient.
const mask_2bit, align=4
|
||||
.byte 3,12,48,192,3,12,48,192
|
||||
.byte 3,12,48,192,3,12,48,192
|
||||
endconst
|
||||
|
||||
// Per-byte bit weights in descending order 128,64,...,1, given twice to fill
// a q register. decimate_score64_neon uses them so that, within each group
// of eight coefficients, the first coefficient lands in the most significant
// bit of the packed byte.
const mask_1bit, align=4
|
||||
.byte 128,64,32,16,8,4,2,1
|
||||
.byte 128,64,32,16,8,4,2,1
|
||||
endconst
|
||||
|
||||
.text
|
||||
|
||||
// QUANT_TWO: quantize 16 coefficients held in two q registers and store them.
//
// Expected on entry:
//   q8,  q9    absolute values of the 16 coefficients
//   q14, q15   the original signed coefficients
//   r0         destination, post-incremented by the 32-byte store
//   r1         read only when load_mf is yes: source of the four mf
//              d registers, post-incremented
// Arguments:
//   bias0, bias1   q registers added to q8 and q9
//   mf0 .. mf3     d registers multiplied with d16 .. d19
//   mask           q register receiving the OR of both result vectors, so
//                  the caller can test whether any output is nonzero
//   load_mf        yes to load mf0-mf3 from [r1] inside the macro
// Per element: ((abs(coef) + bias) * mf) >> 16, negated when the original
// coefficient was negative.
// Overwrites q8-q15 (q10-q13 are temporaries, q14/q15 become sign masks).
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
|
||||
vadd.u16 q8, q8, \bias0
|
||||
vadd.u16 q9, q9, \bias1
|
||||
// The optional mf load happens after the bias add, so the mf registers may
// overlap the bias registers (quant_8x8_neon passes q2 as bias1 and d4/d5
// as mf0/mf1).
.ifc \load_mf, yes
|
||||
vld1.64 {\mf0-\mf3}, [r1,:128]!
|
||||
.endif
|
||||
// Widening 16x16 -> 32 bit multiplies.
vmull.u16 q10, d16, \mf0
|
||||
vmull.u16 q11, d17, \mf1
|
||||
vmull.u16 q12, d18, \mf2
|
||||
vmull.u16 q13, d19, \mf3
|
||||
// Arithmetic shift by 15 turns each original coefficient into 0 or -1.
vshr.s16 q14, q14, #15
|
||||
vshr.s16 q15, q15, #15
|
||||
// Keep the upper 16 bits of every product.
vshrn.u32 d16, q10, #16
|
||||
vshrn.u32 d17, q11, #16
|
||||
vshrn.u32 d18, q12, #16
|
||||
vshrn.u32 d19, q13, #16
|
||||
// (x ^ s) - s with s in {0,-1}: conditional negate to restore the sign.
veor q8, q8, q14
|
||||
veor q9, q9, q15
|
||||
vsub.s16 q8, q8, q14
|
||||
vsub.s16 q9, q9, q15
|
||||
vorr \mask, q8, q9
|
||||
vst1.64 {d16-d19}, [r0,:128]!
|
||||
.endm
|
||||
|
||||
// QUANT_END: return from the enclosing function with r0 = 1 if any bit of
// the 64-bit register d is set, else r0 = 0. Overwrites r2 and r3.
.macro QUANT_END d
|
||||
vmov r2, r3, \d
|
||||
// r0 becomes the OR of both halves (0 when everything is zero) ...
orrs r0, r2, r3
|
||||
// ... and is normalised to 1 otherwise.
movne r0, #1
|
||||
bx lr
|
||||
.endm
|
||||
|
||||
// quant_2x2_dc( int16_t dct[4], int mf, int bias )
// r0 = dct (quantized in place, 64-bit alignment hint), r1 = mf, r2 = bias.
// Each coefficient becomes ((abs(c) + bias) * mf) >> 16 with the sign of c
// restored. Returns 1 in r0 if any result is nonzero, else 0.
|
||||
function quant_2x2_dc_neon
|
||||
vld1.64 {d0}, [r0,:64]
|
||||
vabs.s16 d3, d0
|
||||
// Broadcast the scalar bias and mf to all four lanes.
vdup.16 d2, r2
|
||||
vdup.16 d1, r1
|
||||
vadd.u16 d3, d3, d2
|
||||
vmull.u16 q3, d3, d1
|
||||
// d0 becomes the per-lane sign mask (0 or -1).
vshr.s16 d0, d0, #15
|
||||
vshrn.u32 d3, q3, #16
|
||||
// Conditional negate: (x ^ s) - s.
veor d3, d3, d0
|
||||
vsub.s16 d3, d3, d0
|
||||
vst1.64 {d3}, [r0,:64]
|
||||
QUANT_END d3
|
||||
endfunc
|
||||
|
||||
// quant_4x4_dc( int16_t dct[16], int mf, int bias )
// r0 = dct (quantized in place, 128-bit alignment hint), r1 = mf, r2 = bias.
// Same arithmetic as QUANT_TWO with one scalar mf and bias for all 16
// coefficients. Returns 1 in r0 if any result is nonzero, else 0.
|
||||
function quant_4x4_dc_neon
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// Broadcast bias into q0 and mf into q2 (d4/d5).
vdup.16 q0, r2
|
||||
vdup.16 q2, r1
|
||||
// q0 is both bias inputs and the mask output: the macro consumes the bias
// before it writes the mask.
QUANT_TWO q0, q0, d4, d5, d4, d5, q0
|
||||
// Fold the 128-bit mask to 64 bits for QUANT_END.
vorr d0, d0, d1
|
||||
QUANT_END d0
|
||||
endfunc
|
||||
|
||||
// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
// r0 = dct (quantized in place), r1 = mf, r2 = bias; all three are accessed
// with a 128-bit alignment hint. Uses a separate mf and bias per coefficient.
// Returns 1 in r0 if any result is nonzero, else 0.
|
||||
function quant_4x4_neon
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// bias -> q0,q1   mf -> d4-d7
vld1.64 {d0-d3}, [r2,:128]
|
||||
vld1.64 {d4-d7}, [r1,:128]
|
||||
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
|
||||
vorr d0, d0, d1
|
||||
QUANT_END d0
|
||||
endfunc
|
||||
|
||||
// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
// r0 = dct (four consecutive 16-coefficient blocks, quantized in place),
// r1 = mf, r2 = bias; the same mf and bias are applied to every block.
// Returns a 4-bit mask in r0: bit i is set when block i has a nonzero result.
|
||||
function quant_4x4x4_neon
|
||||
// q4-q7 (d8-d15) hold the four per-block masks; they are callee-saved in
// the ARM procedure call standard, hence the save/restore.
vpush {d8-d15}
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// bias -> q0,q1 and mf -> d4-d7, loaded once and reused for all blocks.
vld1.64 {d0-d3}, [r2,:128]
|
||||
vld1.64 {d4-d7}, [r1,:128]
|
||||
// Block 0, mask in q4. The store inside QUANT_TWO advances r0, so the
// following load already addresses the next block.
QUANT_TWO q0, q1, d4, d5, d6, d7, q4
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// Block 1, mask in q5.
QUANT_TWO q0, q1, d4, d5, d6, d7, q5
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// Block 2, mask in q6.
QUANT_TWO q0, q1, d4, d5, d6, d7, q6
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// Block 3, mask in q7.
QUANT_TWO q0, q1, d4, d5, d6, d7, q7
|
||||
// Fold every 128-bit mask down to 64 bits.
vorr d8, d8, d9
|
||||
vorr d10, d10, d11
|
||||
vorr d12, d12, d13
|
||||
vorr d14, d14, d15
|
||||
// Blocks 0 and 1: r0 becomes 0 or 1, then bit 1 is ORed in.
vmov r0, r1, d8
|
||||
vmov r2, r3, d10
|
||||
orrs r0, r1
|
||||
movne r0, #1
|
||||
orrs r2, r3
|
||||
orrne r0, #2
|
||||
// Blocks 2 and 3: bits 2 and 3.
vmov r1, r2, d12
|
||||
vmov r3, ip, d14
|
||||
orrs r1, r2
|
||||
orrne r0, #4
|
||||
orrs r3, ip
|
||||
orrne r0, #8
|
||||
vpop {d8-d15}
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
// r0 = dct (quantized in place), r1 = mf, r2 = bias; all accessed with a
// 128-bit alignment hint. Processes 16 coefficients per step, four steps.
// Returns 1 in r0 if any result is nonzero, else 0.
|
||||
function quant_8x8_neon
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// First 16: bias -> q0,q1 and mf -> d4-d7, both pointers post-incremented.
vld1.64 {d0-d3}, [r2,:128]!
|
||||
vld1.64 {d4-d7}, [r1,:128]!
|
||||
// q0 receives the running nonzero mask.
QUANT_TWO q0, q1, d4, d5, d6, d7, q0
|
||||
.rept 3
|
||||
vld1.64 {d28-d31}, [r0,:128]
|
||||
vabs.s16 q8, q14
|
||||
vabs.s16 q9, q15
|
||||
// Bias goes to q1,q2 so that q0 keeps the accumulated mask; q2 overlaps
// d4/d5, which is why mf is loaded inside the macro (load_mf = yes) after
// the bias has been added.
vld1.64 {d2-d5}, [r2,:128]!
|
||||
QUANT_TWO q1, q2, d4, d5, d6, d7, q1, yes
|
||||
vorr q0, q0, q1
|
||||
.endr
|
||||
vorr d0, d0, d1
|
||||
QUANT_END d0
|
||||
endfunc
|
||||
|
||||
// DEQUANT_START: shared prologue of the dequant functions.
//
// Expected on entry: r1 = dequant_mf table base, r2 = i_qp.
// Arguments:
//   mf_size   shift amount applied to i_mf to index the table, i.e. log2 of
//             the byte size of one dequant_mf[i] row
//   offset    value subtracted from i_qbits
//   dc        no:        r1 = address of dequant_mf[i_mf]
//             otherwise: r1 = 32-bit value loaded from the start of that row
// On exit:
//   r2 = i_qp % 6, r3 = i_qp / 6 - offset, flags set by that subtraction
//   (callers branch on blt for the negative case). Overwrites ip.
.macro DEQUANT_START mf_size offset dc=no
|
||||
// Multiplying by 0x2b (43) and shifting right by 8 stands in for i_qp / 6.
mov r3, #0x2b
|
||||
mul r3, r3, r2
|
||||
lsr r3, r3, #8 // i_qbits = i_qp / 6
|
||||
// ip = 3 * i_qbits, so i_qp - 2 * ip is the remainder modulo 6.
add ip, r3, r3, lsl #1
|
||||
sub r2, r2, ip, lsl #1 // i_mf = i_qp % 6
|
||||
.ifc \dc,no
|
||||
add r1, r1, r2, lsl #\mf_size // dequant_mf[i_mf]
|
||||
.else
|
||||
ldr r1, [r1, r2, lsl #\mf_size] // dequant_mf[i_mf][0][0]
|
||||
.endif
|
||||
subs r3, r3, #\offset // 6 for 8x8
|
||||
.endm
|
||||
|
||||
// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// dequant_8x8( int16_t dct[64], int dequant_mf[6][64], int i_qp )
//
// DEQUANT size bits: emits dequant_<size>_neon.
//   r0 = dct (rewritten in place), r1 = dequant_mf, r2 = i_qp.
//   With shift = i_qp / 6 - bits and mf = dequant_mf[i_qp % 6]:
//     shift >= 0:  dct[i] = (dct[i] * mf[i]) << shift          (16-bit math)
//     shift <  0:  dct[i] = (dct[i] * mf[i] + (1 << (-shift - 1))) >> -shift
//                  (32-bit math, narrowed back to 16 bits)
//   The 4x4 variant handles 16 coefficients in a single pass; the 8x8
//   variant runs the same body four times with r2 as the loop counter.
|
||||
.macro DEQUANT size bits
|
||||
function dequant_\size\()_neon
|
||||
// One table row holds 2^bits 32-bit entries, hence the row shift bits+2.
DEQUANT_START \bits+2, \bits
|
||||
.ifc \size, 8x8
|
||||
// Loop count; a plain mov leaves the flags from DEQUANT_START intact.
mov r2, #4
|
||||
.endif
|
||||
blt dequant_\size\()_rshift
|
||||
|
||||
// Left-shift path: per-lane shift count for vshl.
vdup.16 q15, r3
|
||||
dequant_\size\()_lshift_loop:
|
||||
.ifc \size, 8x8
|
||||
// The counter is decremented up front; nothing below changes the flags
// before the bgt at the end of the loop body.
subs r2, r2, #1
|
||||
.endif
|
||||
// Load 16 32-bit mf entries and narrow them to 16 bits in q2,q3,
// interleaved with the load of the 16 coefficients into q0,q1.
vld1.32 {d16-d17}, [r1,:128]!
|
||||
vld1.32 {d18-d19}, [r1,:128]!
|
||||
vmovn.s32 d4, q8
|
||||
vld1.32 {d20-d21}, [r1,:128]!
|
||||
vmovn.s32 d5, q9
|
||||
vld1.32 {d22-d23}, [r1,:128]!
|
||||
vmovn.s32 d6, q10
|
||||
vld1.16 {d0-d3}, [r0,:128]
|
||||
vmovn.s32 d7, q11
|
||||
vmul.s16 q0, q0, q2
|
||||
vmul.s16 q1, q1, q3
|
||||
vshl.s16 q0, q0, q15
|
||||
vshl.s16 q1, q1, q15
|
||||
vst1.16 {d0-d3}, [r0,:128]!
|
||||
.ifc \size, 8x8
|
||||
bgt dequant_\size\()_lshift_loop
|
||||
.endif
|
||||
bx lr
|
||||
|
||||
dequant_\size\()_rshift:
|
||||
// q15 keeps the negative count: vshl by a negative amount shifts right.
vdup.32 q15, r3
|
||||
// ip = 1 << (-shift - 1), the rounding term.
rsb r3, r3, #0
|
||||
mov ip, #1
|
||||
sub r3, r3, #1
|
||||
lsl ip, ip, r3
|
||||
|
||||
.ifc \size, 8x8
|
||||
dequant_\size\()_rshift_loop:
|
||||
subs r2, r2, #1
|
||||
.endif
|
||||
// The accumulators q10-q13 start at the rounding term; mf is narrowed to
// 16 bits in d4-d7 while the loads are in flight.
vdup.32 q10, ip
|
||||
vld1.32 {d16-d17}, [r1,:128]!
|
||||
vdup.32 q11, ip
|
||||
vld1.32 {d18-d19}, [r1,:128]!
|
||||
vmovn.s32 d4, q8
|
||||
vld1.32 {d16-d17}, [r1,:128]!
|
||||
vmovn.s32 d5, q9
|
||||
vld1.32 {d18-d19}, [r1,:128]!
|
||||
vmovn.s32 d6, q8
|
||||
vld1.16 {d0-d3}, [r0,:128]
|
||||
vmovn.s32 d7, q9
|
||||
vdup.32 q12, ip
|
||||
vdup.32 q13, ip
|
||||
|
||||
// Widening multiply-accumulate: rounding term + dct * mf in 32 bits.
vmlal.s16 q10, d0, d4
|
||||
vmlal.s16 q11, d1, d5
|
||||
vmlal.s16 q12, d2, d6
|
||||
vmlal.s16 q13, d3, d7
|
||||
vshl.s32 q10, q10, q15
|
||||
vshl.s32 q11, q11, q15
|
||||
vshl.s32 q12, q12, q15
|
||||
vshl.s32 q13, q13, q15
|
||||
|
||||
// Narrow back to 16 bits and store.
vmovn.s32 d0, q10
|
||||
vmovn.s32 d1, q11
|
||||
vmovn.s32 d2, q12
|
||||
vmovn.s32 d3, q13
|
||||
vst1.16 {d0-d3}, [r0,:128]!
|
||||
.ifc \size, 8x8
|
||||
bgt dequant_\size\()_rshift_loop
|
||||
.endif
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
DEQUANT 4x4, 4
|
||||
DEQUANT 8x8, 6
|
||||
|
||||
// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
// r0 = dct (rewritten in place), r1 = dequant_mf, r2 = i_qp.
// Uses only the first entry of dequant_mf[i_qp % 6] as a scalar mf.
// With shift = i_qp / 6 - 6:
//   shift >= 0:  dct[i] = dct[i] * (mf << shift)               (16-bit math)
//   shift <  0:  dct[i] = (dct[i] * mf + (1 << (-shift - 1))) >> -shift
|
||||
function dequant_4x4_dc_neon
|
||||
// dc = yes: r1 now holds the loaded mf value, r3 = shift with flags set.
DEQUANT_START 6, 6, yes
|
||||
blt dequant_4x4_dc_rshift
|
||||
|
||||
// Fold the left shift into the scalar multiplier.
lsl r1, r1, r3
|
||||
vdup.16 q2, r1
|
||||
vld1.16 {d0-d3}, [r0,:128]
|
||||
// q15 is written here but not read again on this path.
vdup.16 q15, r3
|
||||
|
||||
vmul.s16 q0, q0, q2
|
||||
vmul.s16 q1, q1, q2
|
||||
vst1.16 {d0-d3}, [r0,:128]
|
||||
bx lr
|
||||
|
||||
dequant_4x4_dc_rshift:
|
||||
vdup.16 d4, r1
|
||||
// Negative count in q15: vshl then performs the right shift.
vdup.32 q15, r3
|
||||
// ip = 1 << (-shift - 1), the rounding term.
rsb r3, r3, #0
|
||||
mov ip, #1
|
||||
sub r3, r3, #1
|
||||
lsl ip, ip, r3
|
||||
|
||||
vdup.32 q10, ip
|
||||
vdup.32 q11, ip
|
||||
vld1.16 {d0-d3}, [r0,:128]
|
||||
vdup.32 q12, ip
|
||||
vdup.32 q13, ip
|
||||
|
||||
// rounding term + dct * mf in 32 bits.
vmlal.s16 q10, d0, d4
|
||||
vmlal.s16 q11, d1, d4
|
||||
vmlal.s16 q12, d2, d4
|
||||
vmlal.s16 q13, d3, d4
|
||||
vshl.s32 q10, q10, q15
|
||||
vshl.s32 q11, q11, q15
|
||||
vshl.s32 q12, q12, q15
|
||||
vshl.s32 q13, q13, q15
|
||||
|
||||
vmovn.s32 d0, q10
|
||||
vmovn.s32 d1, q11
|
||||
vmovn.s32 d2, q12
|
||||
vmovn.s32 d3, q13
|
||||
vst1.16 {d0-d3}, [r0,:128]
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
// decimate_score_1x size: emits decimate_score<size>_neon (size 15 or 16).
//   r0 = pointer to 16 int16 coefficients (128-bit alignment hint).
//   Returns in r0:
//     9  if any coefficient has a magnitude above 1,
//     0  if every loaded coefficient is zero,
//     otherwise the sum of decimate_table4[run] over the nonzero
//     coefficients, where run is the number of zero coefficients directly
//     before it in index order.
//   The 15 variant loads the same 16 values but drops the first one from
//   the run scan.
//   Overwrites r1-r3, r12, q0-q3, q8.
.macro decimate_score_1x size
|
||||
function decimate_score\size\()_neon
|
||||
vld1.16 {q0, q1}, [r0, :128]
|
||||
movrel r3, mask_2bit
|
||||
vmov.s8 q3, #0x01
|
||||
// Saturating narrow to one byte per coefficient: q0 = 16 bytes.
vqmovn.s16 d0, q0
|
||||
vqmovn.s16 d1, q1
|
||||
vqabs.s8 q2, q0
|
||||
vld1.8 {q8}, [r3, :128]
|
||||
// q1: 0xff where the coefficient is zero.  q2: 0xff where magnitude > 1.
vceq.s8 q1, q0, #0
|
||||
vcgt.s8 q2, q2, q3
|
||||
// Keep a distinct 2-bit field per zero coefficient.
vand.u8 q1, q1, q8
|
||||
// Pack both flag vectors down to 32 bits with narrowing and pairwise adds;
// r1 ends up with two bits per coefficient, coefficient 0 in bits 0-1.
vshrn.u16 d4, q2, #4
|
||||
vpadd.u8 d2, d2, d3
|
||||
vpadd.u8 d4, d4, d4
|
||||
vpadd.u8 d2, d2, d2
|
||||
vmov.32 r2, d4[0]
|
||||
vmov.32 r1, d2[0]
|
||||
// Any magnitude above 1: return 9 right away.
cmp r2, #0
|
||||
beq 0f
|
||||
mov r0, #9
|
||||
bx lr
|
||||
0:
|
||||
// Invert so that set bits mark nonzero coefficients; the flags from mvns
// are consumed by bxeq below (nothing nonzero: return 0).
mvns r1, r1
|
||||
mov r0, #0
|
||||
bxeq lr
|
||||
.ifc \size, 15
|
||||
// Shift out the 2-bit field of the first loaded coefficient.
// NOTE(review): if that dropped coefficient is the only nonzero one, r1 is
// 0 here, clz below yields 32 and the table is read at index 16 -
// presumably callers never pass such input; confirm, or confirm that
// index 16 of the table is valid.
lsr r1, r1, #2
|
||||
.endif
|
||||
// Reverse the bits so the lowest remaining coefficient sits at the top and
// clz can measure runs of zero coefficients.
rbit r1, r1
|
||||
// movrelx is defined outside this view; it is given r2 as a third argument.
movrelx r3, X264(decimate_table4), r2
|
||||
1:
|
||||
// r2 = 2 * run (two bits per coefficient); drop the run, look up the
// score for run = r2 / 2.
clz r2, r1
|
||||
lsl r1, r1, r2
|
||||
lsr r12, r2, #1
|
||||
ldrb r2, [r3, r12]
|
||||
// Shift out the nonzero coefficient itself; Z is set once nothing is left.
lsls r1, r1, #2
|
||||
add r0, r0, r2
|
||||
bne 1b
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
decimate_score_1x 15
|
||||
decimate_score_1x 16
|
||||
|
||||
// decimate_score64: r0 = pointer to 64 int16 coefficients (128-bit
// alignment hint). Returns in r0:
//   9  if any coefficient has a magnitude above 1,
//   otherwise the sum of decimate_table8[run] over the nonzero
//   coefficients, where run is the number of zero coefficients directly
//   before it in index order (0 when all coefficients are zero).
// lr is used as a scratch counter, hence the push/pop.
function decimate_score64_neon
|
||||
push {lr}
|
||||
vld1.16 {q8, q9}, [r0, :128]!
|
||||
vld1.16 {q10, q11}, [r0, :128]!
|
||||
vld1.16 {q12, q13}, [r0, :128]!
|
||||
vld1.16 {q14, q15}, [r0, :128]
|
||||
movrel r3, mask_1bit
|
||||
vmov.s8 q3, #0x01
|
||||
// Saturating narrow to bytes. Within every q register the two halves are
// written in swapped order (the earlier eight coefficients go to the odd d
// register); together with mask_1bit and the add tree below this puts
// coefficient 0 in the top bit of the final 64-bit map.
vqmovn.s16 d17, q8
|
||||
vqmovn.s16 d16, q9
|
||||
vqmovn.s16 d19, q10
|
||||
vqmovn.s16 d18, q11
|
||||
vqmovn.s16 d21, q12
|
||||
vqmovn.s16 d20, q13
|
||||
vqmovn.s16 d23, q14
|
||||
vqmovn.s16 d22, q15
|
||||
vqabs.s8 q12, q8
|
||||
vqabs.s8 q13, q9
|
||||
vqabs.s8 q14, q10
|
||||
vqabs.s8 q15, q11
|
||||
vld1.8 {q2}, [r3, :128]
|
||||
// 0xff where the coefficient is zero.
vceq.s8 q8, q8, #0
|
||||
vceq.s8 q9, q9, #0
|
||||
vceq.s8 q10, q10, #0
|
||||
vceq.s8 q11, q11, #0
|
||||
// Reduce the magnitudes with max so that a single compare covers all 64.
vmax.s8 q12, q12, q13
|
||||
vmax.s8 q14, q14, q15
|
||||
// One distinct bit per zero coefficient.
vand.u8 q8, q8, q2
|
||||
vand.u8 q9, q9, q2
|
||||
vand.u8 q10, q10, q2
|
||||
vand.u8 q11, q11, q2
|
||||
vmax.s8 q12, q12, q14
|
||||
// Pairwise-add tree: 64 flag bytes -> 8 bytes in d16, interleaved with the
// reduction of the magnitude test into d24.
vpadd.u8 d18, d18, d19
|
||||
vpadd.u8 d19, d16, d17
|
||||
vcgt.s8 q12, q12, q3
|
||||
vpadd.u8 d22, d22, d23
|
||||
vpadd.u8 d23, d20, d21
|
||||
vshrn.u16 d24, q12, #4
|
||||
vpadd.u8 d16, d22, d23
|
||||
vpadd.u8 d17, d18, d19
|
||||
vpadd.u8 d24, d24, d24
|
||||
vpadd.u8 d16, d16, d17
|
||||
// r2 != 0: some magnitude is above 1.
// r1  = zero map of coefficients 0-31  (coefficient 0 in bit 31)
// r12 = zero map of coefficients 32-63 (coefficient 32 in bit 31)
vmov.32 r2, d24[0]
|
||||
vmov r12, r1, d16
|
||||
cmp r2, #0
|
||||
beq 0f
|
||||
mov r0, #9
|
||||
pop {pc}
|
||||
0:
|
||||
// Invert both maps so set bits mark nonzero coefficients. The flags from
// mvns r1 must survive until beq 2f; the instructions in between are
// expected to leave them alone (movrelx is defined outside this view).
mvns r1, r1
|
||||
mvn r12, r12
|
||||
mov r0, #0
|
||||
// lr counts the bits of the first word that have not been consumed yet.
mov lr, #32
|
||||
movrelx r3, X264(decimate_table8), r2
|
||||
beq 2f
|
||||
1:
|
||||
// First word: r2 = run of zero coefficients before the next nonzero one.
clz r2, r1
|
||||
lsl r1, r1, r2
|
||||
sub lr, lr, r2
|
||||
ldrb r2, [r3, r2]
|
||||
// Shift out the nonzero coefficient; Z is set once the word is exhausted.
lsls r1, r1, #1
|
||||
sub lr, lr, #1
|
||||
add r0, r0, r2
|
||||
bne 1b
|
||||
2:
|
||||
// lr now holds the number of zero coefficients at the end of the first word.
cmp r12, #0
|
||||
popeq {pc}
|
||||
|
||||
// First nonzero coefficient of the second word: its run also includes
// the zeros left over from the first word.
clz r2, r12
|
||||
lsl r1, r12, r2
|
||||
add r2, r2, lr
|
||||
ldrb r2, [r3, r2]
|
||||
lsls r1, r1, #1
|
||||
add r0, r0, r2
|
||||
popeq {pc}
|
||||
3:
|
||||
// Remaining nonzero coefficients of the second word.
clz r2, r1
|
||||
lsl r1, r1, r2
|
||||
ldrb r2, [r3, r2]
|
||||
lsls r1, r1, #1
|
||||
add r0, r0, r2
|
||||
bne 3b
|
||||
pop {pc}
|
||||
endfunc
|
||||
|
||||
// int coeff_last( int16_t *l )
// coeff_last4: r0 = pointer to four int16 coefficients. Returns in r0 the
// index of the last nonzero coefficient, or 0 when all four are zero.
// The upper halfword of each loaded word is treated as the higher-indexed
// coefficient (little-endian layout).
|
||||
function coeff_last4_arm
|
||||
// r2 = coefficients 0,1   r3 = coefficients 2,3
ldrd r2, r3, [r0]
|
||||
// r0 = r3 with flags: leaves r0 = 0 when the upper pair is zero ...
subs r0, r3, #0
|
||||
// ... otherwise the answer is 2 or 3 and the upper pair is examined.
movne r0, #2
|
||||
movne r2, r3
|
||||
// Nonzero upper halfword: the odd coefficient of the pair is the last one.
lsrs r2, r2, #16
|
||||
addne r0, r0, #1
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
// coeff_last8: r0 = pointer to eight int16 coefficients. Returns in r0 the
// index of the last nonzero coefficient, or 0 when all eight are zero.
// The upper halfword of each loaded word is treated as the higher-indexed
// coefficient (little-endian layout). Overwrites r2, r3, ip.
function coeff_last8_arm
|
||||
// Start with coefficients 4-7.
ldrd r2, r3, [r0, #8]
|
||||
orrs ip, r2, r3
|
||||
// Upper half nonzero: base index 4. Otherwise load coefficients 0-3 (r0
// is still the pointer at that moment) and use base index 0.
movne r0, #4
|
||||
ldrdeq r2, r3, [r0]
|
||||
moveq r0, #0
|
||||
// Pick the upper pair of the selected half when it is nonzero.
tst r3, r3
|
||||
addne r0, #2
|
||||
movne r2, r3
|
||||
// Nonzero upper halfword: the odd coefficient of the pair is the last one.
lsrs r2, r2, #16
|
||||
addne r0, r0, #1
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
// COEFF_LAST_1x size: emits coeff_last<size>_neon (size 15 or 16).
//   r0 = pointer to the int16 coefficients. Returns in r0 the index of the
//   last nonzero coefficient, or 0 when none is found.
//   For size 15 the pointer is moved back by one int16 before the load (which
//   carries a 128-bit alignment hint), so the 15 coefficients occupy lanes
//   1..15 and the result constants are reduced by one.
//   Overwrites r1, r3, ip, q0, q1.
.macro COEFF_LAST_1x size
|
||||
function coeff_last\size\()_neon
|
||||
.if \size == 15
|
||||
sub r0, r0, #2
|
||||
.endif
|
||||
vld1.64 {d0-d3}, [r0,:128]
|
||||
// 0xffff in every lane that holds a nonzero value.
vtst.16 q0, q0
|
||||
vtst.16 q1, q1
|
||||
// Narrow twice: 16 bits -> 8 bits -> 4 bits per lane, giving a 64-bit
// map in d0 with one nibble per lane (lane 0 in the lowest nibble).
vshrn.u16 d0, q0, #8
|
||||
vshrn.u16 d1, q1, #8
|
||||
vshrn.u16 d0, q0, #4
|
||||
// Leading zeros of each 32-bit half; divided by 4 below this is the
// number of zero lanes at the top of that half.
vclz.i32 d0, d0
|
||||
mov ip, #7
|
||||
mov r3, #\size - 9
|
||||
// r0 = count for lanes 0-7, r1 = count for lanes 8-15.
vmov r0, r1, d0
|
||||
|
||||
// Upper half: 7 - zero_lanes is negative only when the half is all zero.
subs r1, ip, r1, lsr #2
|
||||
addge r0, r1, #\size - 8
|
||||
// Lower half, only evaluated when the upper half was empty; a negative
// result (nothing found) is clamped to 0.
subslt r0, r3, r0, lsr #2
|
||||
movlt r0, #0
|
||||
bx lr
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
COEFF_LAST_1x 15
|
||||
COEFF_LAST_1x 16
|
||||
|
||||
// coeff_last64: r0 = pointer to 64 int16 coefficients (128-bit alignment
// hint). Returns in r0 the index of the last nonzero coefficient, or 0
// when all are zero. Overwrites r1, ip, q0, q1, q8-q15.
function coeff_last64_neon
|
||||
// Load 16 coefficients at a time and narrow each to one byte; only the
// zero / nonzero distinction of each byte is used below.
vld1.64 {d16-d19}, [r0,:128]!
|
||||
vqmovn.u16 d16, q8
|
||||
vqmovn.u16 d17, q9
|
||||
vld1.64 {d20-d23}, [r0,:128]!
|
||||
vqmovn.u16 d18, q10
|
||||
vqmovn.u16 d19, q11
|
||||
vld1.64 {d24-d27}, [r0,:128]!
|
||||
vqmovn.u16 d20, q12
|
||||
vqmovn.u16 d21, q13
|
||||
vld1.64 {d28-d31}, [r0,:128]!
|
||||
vqmovn.u16 d22, q14
|
||||
vqmovn.u16 d23, q15
|
||||
|
||||
movrel r1, pmovmskb_byte
|
||||
vld1.64 {d0-d1}, [r1,:128]
|
||||
|
||||
// 0xff for every nonzero byte.
vtst.8 q8, q8
|
||||
vtst.8 q9, q9
|
||||
vtst.8 q10, q10
|
||||
vtst.8 q11, q11
|
||||
|
||||
// Keep one distinct bit per coefficient within each group of eight.
vand q8, q8, q0
|
||||
vand q9, q9, q0
|
||||
vand q10, q10, q0
|
||||
vand q11, q11, q0
|
||||
|
||||
// Pairwise-add tree: 64 bytes -> 8 bytes, i.e. a 64-bit map in d0 with
// coefficient i in bit i.
vpadd.u8 d0, d16, d17
|
||||
vpadd.u8 d1, d18, d19
|
||||
vpadd.u8 d2, d20, d21
|
||||
vpadd.u8 d3, d22, d23
|
||||
vpadd.u8 d0, d0, d1
|
||||
vpadd.u8 d1, d2, d3
|
||||
vpadd.u8 d0, d0, d1
|
||||
// Leading zeros of each 32-bit half of the map.
vclz.i32 d0, d0
|
||||
mov ip, #31
|
||||
// r0 = count for coefficients 0-31, r1 = count for coefficients 32-63.
vmov r0, r1, d0
|
||||
|
||||
// Upper half: 31 - clz is negative only when that half is all zero.
subs r1, ip, r1
|
||||
addge r0, r1, #32
|
||||
// Lower half, only evaluated when the upper half was empty; a negative
// result (nothing found) is clamped to 0.
subslt r0, ip, r0
|
||||
movlt r0, #0
|
||||
bx lr
|
||||
endfunc
|
||||
|
||||
// denoise_dct: declared in quant.h as
//   void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int )
//   r0 = coefficients (handled as int16, rewritten in place)
//   r1 = 32-bit accumulators, one per coefficient
//   r2 = unsigned 16-bit values subtracted from the magnitudes
//   r3 = coefficient count; 16 are processed per iteration and the loop
//        repeats while the remaining count is above zero
// Per coefficient: accumulator += abs(c); the magnitude is reduced by the
// r2 value with unsigned saturation (never below 0) and the sign of c is
// put back.
function denoise_dct_neon
|
||||
// The flags from this subs are consumed by the bgt at the bottom; nothing
// in between updates them.
1: subs r3, r3, #16
|
||||
vld1.16 {q0, q1}, [r0]
|
||||
// Load 16 accumulators, then rewind r1 so the stores below hit the same
// addresses.
vld1.32 {q12, q13}, [r1]!
|
||||
vld1.32 {q14, q15}, [r1]
|
||||
sub r1, #32
|
||||
vabs.s16 q8, q0
|
||||
vabs.s16 q9, q1
|
||||
vld1.16 {q2, q3}, [r2]!
|
||||
// q10,q11: all-ones lanes where the coefficient is negative.
vclt.s16 q10, q0, #0
|
||||
vclt.s16 q11, q1, #0
|
||||
// Widening add of the magnitudes into the 32-bit accumulators.
vaddw.u16 q12, q12, d16
|
||||
vaddw.u16 q13, q13, d17
|
||||
// Saturating subtract: the magnitude is clamped at 0.
vqsub.u16 q0, q8, q2
|
||||
vqsub.u16 q1, q9, q3
|
||||
vaddw.u16 q14, q14, d18
|
||||
vaddw.u16 q15, q15, d19
|
||||
// Select the negated value where the sign mask is set, else the positive.
vneg.s16 q8, q0
|
||||
vneg.s16 q9, q1
|
||||
vbsl q10, q8, q0
|
||||
vbsl q11, q9, q1
|
||||
vst1.32 {q12, q13}, [r1]!
|
||||
vst1.32 {q14, q15}, [r1]!
|
||||
vst1.16 {q10, q11}, [r0]!
|
||||
bgt 1b
|
||||
bx lr
|
||||
endfunc
|
||||
71
common/arm/quant.h
Normal file
71
common/arm/quant.h
Normal file
@@ -0,0 +1,71 @@
|
||||
/*****************************************************************************
|
||||
* quant.h: arm quantization and level-run
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2005-2025 x264 project
|
||||
*
|
||||
* Authors: David Conrad <lessen42@gmail.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_ARM_QUANT_H
|
||||
#define X264_ARM_QUANT_H
|
||||
|
||||
#define x264_quant_2x2_dc_armv6 x264_template(quant_2x2_dc_armv6)
|
||||
int x264_quant_2x2_dc_armv6( int16_t dct[4], int mf, int bias );
|
||||
|
||||
#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
|
||||
int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
|
||||
#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
|
||||
int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
|
||||
#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
|
||||
int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
|
||||
#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
|
||||
int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
|
||||
#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
|
||||
int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
|
||||
|
||||
#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
|
||||
void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
|
||||
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
|
||||
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||
|
||||
#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
|
||||
int x264_decimate_score15_neon( int16_t * );
|
||||
#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
|
||||
int x264_decimate_score16_neon( int16_t * );
|
||||
#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
|
||||
int x264_decimate_score64_neon( int16_t * );
|
||||
|
||||
#define x264_coeff_last4_arm x264_template(coeff_last4_arm)
|
||||
int x264_coeff_last4_arm( int16_t * );
|
||||
#define x264_coeff_last8_arm x264_template(coeff_last8_arm)
|
||||
int x264_coeff_last8_arm( int16_t * );
|
||||
#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
|
||||
int x264_coeff_last15_neon( int16_t * );
|
||||
#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
|
||||
int x264_coeff_last16_neon( int16_t * );
|
||||
#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
|
||||
int x264_coeff_last64_neon( int16_t * );
|
||||
|
||||
#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
|
||||
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user