x264 source for verification 2026-05-22
This commit is contained in:
136
common/x86/bitstream-a.asm
Normal file
136
common/x86/bitstream-a.asm
Normal file
@@ -0,0 +1,136 @@
|
||||
;*****************************************************************************
|
||||
;* bitstream-a.asm: x86 bitstream functions
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2010-2025 x264 project
|
||||
;*
|
||||
;* Authors: Fiona Glaser <fiona@x264.com>
|
||||
;* Henrik Gramner <henrik@gramner.com>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro NAL_LOOP 2
; NAL_LOOP label, load
;   %1 = name of the SIMD loop entry label
;   %2 = instruction used to load the next source vectors (mova or movu)
; Stores the two vectors already held in m1/m2 to dst, compares them against
; m0 and builds a per-byte "is zero" bitmask for the 2*mmsize bytes.
; k4 = mask & (2*mask+1) is non-zero when a zero byte directly follows another
; zero byte (the +1 treats the byte before the block as zero).
; Uses r0/r1/r2 as addressing registers and k3/k4 as mask registers; the
; caller is expected to have defined k3/k4 and loaded m1/m2.
%%escape:
    ; Detect false positive to avoid unnecessary escape loop
    xor      r3d, r3d
    cmp byte [r0+r1-1], 0           ; byte written to dst just before this block
    setnz    r3b                    ; r3 = 1 if that byte is non-zero
    xor       k3, k4                ; zero only if k4 == 1 and the previous byte != 0
    jnz .escape
    jmp %%continue
ALIGN 16
%1:
    mova [r0+r1+mmsize], m1
    pcmpeqb   m1, m0
    mova [r0+r1], m2
    pcmpeqb   m2, m0
    pmovmskb r3d, m1
    %2        m1, [r1+r2+3*mmsize]  ; preload the vectors for the next iteration
    pmovmskb r4d, m2
    %2        m2, [r1+r2+2*mmsize]
    shl       k3, mmsize
    or        k3, k4                ; k3 = zero-byte mask of all 2*mmsize bytes
    lea       k4, [2*r3+1]
    and       k4, k3                ; bits where the byte and its predecessor are zero
    jnz %%escape
%%continue:
    add       r1, 2*mmsize
    jl %1                           ; loop while the (negative) offset is below zero
%endmacro
|
||||
|
||||
%macro NAL_ESCAPE 0
; Emits nal_escape for the current INIT_* instruction set.
; Arguments (see the C prototype above): r0 = dst, r1 = src, r2 = end.
; Returns the final dst position in rax.
; Inside the function:
;   r1 = negative offset of the current source byte from end
;   r0 = dst position such that [r0+r1] is the current output byte
;   r3 = history of the most recent bytes (scalar path) / mask scratch
;   m0 = all-zero vector used for the zero-byte comparison
; k3/k4 are the mask registers used by NAL_LOOP; the full-width registers are
; selected when mmsize == 32.
%if mmsize == 32
    %xdefine k3 r3
    %xdefine k4 r4
%else
    %xdefine k3 r3d
    %xdefine k4 r4d
%endif

cglobal nal_escape, 3,5
    movzx    r3d, byte [r1]
    sub       r1, r2 ; r1 = offset of current src pointer from end of src
    pxor      m0, m0
    mov     [r0], r3b
    sub       r0, r1 ; r0 = projected end of dst, assuming no more escapes
    or       r3d, 0xffffff00 ; ignore data before src

    ; Start off by jumping into the escape loop in case there's an escape at the start.
    ; And do a few more in scalar until dst is aligned.
    jmp .escape_loop

%if mmsize == 16
    NAL_LOOP .loop_aligned, mova
    jmp .ret
%endif
    NAL_LOOP .loop_unaligned, movu
.ret:
    movifnidn rax, r0
    RET

.escape:
    ; Skip bytes that are known to be valid
    and       k4, k3
    tzcnt     k4, k4
    xor      r3d, r3d ; the last two bytes are known to be zero
    add       r1, r4
.escape_loop:
    inc       r1
    jge .ret                        ; offset reached zero: end of source
    movzx    r4d, byte [r1+r2]
    shl      r3d, 8
    or       r3d, r4d               ; r3 = (history << 8) | current byte
    test     r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3
    jz .add_escape_byte
.escaped:
    lea      r4d, [r0+r1]
    mov  [r0+r1], r3b
    test     r4d, mmsize-1 ; Do SIMD when dst is aligned
    jnz .escape_loop
    movu      m1, [r1+r2+mmsize]
    movu      m2, [r1+r2]
%if mmsize == 16
    lea      r4d, [r1+r2]
    test     r4d, mmsize-1          ; source aligned too: use the mova loop
    jz .loop_aligned
%endif
    jmp .loop_unaligned

.add_escape_byte:
    mov byte [r0+r1], 3             ; insert the escape byte before the current byte
    inc       r0                    ; output is now one byte longer
    or       r3d, 0x0300            ; record the inserted 3 in the byte history
    jmp .escaped
%endmacro
|
||||
|
||||
; Instantiate nal_escape for each supported instruction set.
INIT_MMX mmx2
NAL_ESCAPE
INIT_XMM sse2
NAL_ESCAPE
%if ARCH_X86_64
; The AVX2 version is only built for x86-64.
INIT_YMM avx2
NAL_ESCAPE
%endif
|
||||
64
common/x86/bitstream.h
Normal file
64
common/x86/bitstream.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*****************************************************************************
|
||||
* bitstream.h: x86 bitstream functions
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_BITSTREAM_H
|
||||
#define X264_X86_BITSTREAM_H
|
||||
|
||||
#define x264_nal_escape_mmx2 x264_template(nal_escape_mmx2)
|
||||
uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||
#define x264_nal_escape_sse2 x264_template(nal_escape_sse2)
|
||||
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||
#define x264_nal_escape_avx2 x264_template(nal_escape_avx2)
|
||||
uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
|
||||
#define x264_cabac_block_residual_rd_internal_sse2 x264_template(cabac_block_residual_rd_internal_sse2)
|
||||
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_rd_internal_lzcnt x264_template(cabac_block_residual_rd_internal_lzcnt)
|
||||
void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_rd_internal_ssse3 x264_template(cabac_block_residual_rd_internal_ssse3)
|
||||
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_rd_internal_ssse3_lzcnt x264_template(cabac_block_residual_rd_internal_ssse3_lzcnt)
|
||||
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_rd_internal_avx512 x264_template(cabac_block_residual_rd_internal_avx512)
|
||||
void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_8x8_rd_internal_sse2 x264_template(cabac_block_residual_8x8_rd_internal_sse2)
|
||||
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_8x8_rd_internal_lzcnt x264_template(cabac_block_residual_8x8_rd_internal_lzcnt)
|
||||
void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_8x8_rd_internal_ssse3 x264_template(cabac_block_residual_8x8_rd_internal_ssse3)
|
||||
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt x264_template(cabac_block_residual_8x8_rd_internal_ssse3_lzcnt)
|
||||
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_8x8_rd_internal_avx512 x264_template(cabac_block_residual_8x8_rd_internal_avx512)
|
||||
void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_internal_sse2 x264_template(cabac_block_residual_internal_sse2)
|
||||
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_internal_lzcnt x264_template(cabac_block_residual_internal_lzcnt)
|
||||
void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_internal_avx2 x264_template(cabac_block_residual_internal_avx2)
|
||||
void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
#define x264_cabac_block_residual_internal_avx512 x264_template(cabac_block_residual_internal_avx512)
|
||||
void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
|
||||
|
||||
#endif
|
||||
768
common/x86/cabac-a.asm
Normal file
768
common/x86/cabac-a.asm
Normal file
@@ -0,0 +1,768 @@
|
||||
;*****************************************************************************
|
||||
;* cabac-a.asm: x86 cabac
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2008-2025 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Fiona Glaser <fiona@x264.com>
|
||||
;* Holger Lubitz <holger@lubitz.org>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION_RODATA 64
|
||||
|
||||
%if ARCH_X86_64
|
||||
; COEFF_LAST_TABLE name, cpu_last4, cpu_last64, cpu_last15_16
; Emits the table coeff_last_<name>: 14 dword entries, each holding the offset
; of a coeff_last<N>_<cpu> function relative to the start of the table.
; The block size N for each of the 14 entries comes from the default
; parameters %5..%18 (16, 15, 16, 4, 15, 64, ...); the cpu suffix is chosen by
; N: %2 for N == 4, %3 for N == 64, %4 otherwise.
%macro COEFF_LAST_TABLE 4-18 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
    %xdefine %%funccpu1 %2 ; last4
    %xdefine %%funccpu2 %3 ; last64
    %xdefine %%funccpu3 %4 ; last15/last16
    coeff_last_%1:
    %xdefine %%base coeff_last_%1
    %rep 14
        %ifidn %5, 4
            dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base
        %elifidn %5, 64
            dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu2) - %%base
        %else
            dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu3) - %%base
        %endif
        %rotate 1                   ; advance to the next block size
    %endrep
    dd 0, 0 ; 64-byte alignment padding
%endmacro
|
||||
|
||||
; coeff_last functions referenced by the tables below (defined elsewhere).
cextern coeff_last4_mmx2
cextern coeff_last4_lzcnt
%if HIGH_BIT_DEPTH
cextern coeff_last4_avx512
%endif
cextern coeff_last15_sse2
cextern coeff_last15_lzcnt
cextern coeff_last15_avx512
cextern coeff_last16_sse2
cextern coeff_last16_lzcnt
cextern coeff_last16_avx512
cextern coeff_last64_sse2
cextern coeff_last64_lzcnt
cextern coeff_last64_avx2
cextern coeff_last64_avx512

; One table per instruction-set flavour: name, last4 cpu, last64 cpu, last15/16 cpu.
COEFF_LAST_TABLE sse2,   mmx2,   sse2,   sse2
COEFF_LAST_TABLE lzcnt,  lzcnt,  lzcnt,  lzcnt
COEFF_LAST_TABLE avx2,   lzcnt,  avx2,   lzcnt
%if HIGH_BIT_DEPTH
COEFF_LAST_TABLE avx512, avx512, avx512, avx512
%else
; coeff_last4_avx512 is only declared for HIGH_BIT_DEPTH, so use the lzcnt one here.
COEFF_LAST_TABLE avx512, lzcnt,  avx512, avx512
%endif
|
||||
%endif
|
||||
|
||||
; Lookup tables indexed by the current node context (0..7).
; coeff_abs_level1_ctx / coeff_abs_levelgt1_ctx give the context offset used
; for the first / subsequent level decisions.
; coeff_abs_level_transition holds two rows of 8: the first row is the next
; node context after a level <= 1, the second row (offset +8) after a level > 1.
coeff_abs_level1_ctx:       db 1, 2, 3, 4, 0, 0, 0, 0
coeff_abs_levelgt1_ctx:     db 5, 5, 5, 5, 6, 7, 8, 9
coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
                            db 4, 4, 4, 4, 5, 6, 7, 7
|
||||
|
||||
SECTION .text
|
||||
|
||||
cextern_common cabac_range_lps
|
||||
cextern_common cabac_transition
|
||||
cextern_common cabac_renorm_shift
|
||||
cextern_common cabac_entropy
|
||||
cextern cabac_size_unary
|
||||
cextern cabac_transition_unary
|
||||
cextern_common significant_coeff_flag_offset
|
||||
cextern_common significant_coeff_flag_offset_8x8
|
||||
cextern_common last_coeff_flag_offset
|
||||
cextern_common last_coeff_flag_offset_8x8
|
||||
cextern_common coeff_abs_level_m1_offset
|
||||
cextern_common count_cat_m1
|
||||
cextern cabac_encode_ue_bypass
|
||||
|
||||
; "pointer" reserves one pointer-sized slot for the target architecture.
%if ARCH_X86_64
    %define pointer resq
%else
    %define pointer resd
%endif

; Field offsets of the CABAC context passed as the "cb" argument.
; NOTE(review): presumably mirrors the C x264_cabac_t layout - keep the two in sync.
struc cb
    .low: resd 1
    .range: resd 1
    .queue: resd 1
    .bytes_outstanding: resd 1
    .start: pointer 1
    .p: pointer 1
    .end: pointer 1
    align 64, resb 1                ; following fields start on a 64-byte boundary
    .bits_encoded: resd 1
    .state: resb 1024
endstruc
|
||||
|
||||
; LOAD_GLOBAL dst, base, off1, [off2 = 0], [tmp]
; Zero-extending byte load of [base + off1 + off2].
; On x86-64 the address is formed relative to r7, which must hold the address
; of $$ (start of the section); when off2 is non-zero, tmp is clobbered to
; hold r7+off2.
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
%if ARCH_X86_64 == 0
    movzx %1, byte [%2+%3+%4]
%elifidn %4, 0
    movzx %1, byte [%2+%3+r7-$$]
%else
    lea   %5, [r7+%4]
    movzx %1, byte [%2+%3+%5-$$]
%endif
%endmacro
|
||||
|
||||
; CABAC suffix
;   %1 = function name suffix (asm or bmi2); bmi2 selects lzcnt/shlx/shrx/bzhi
;        code paths.
; Emits cabac_encode_decision_%1, cabac_encode_bypass_%1, (for non-bmi2)
; cabac_encode_terminal_%1 and the shared tail cabac_putbyte_%1.
%macro CABAC 1
; t3 must be ecx, since it's used for shift.
%if WIN64
    DECLARE_REG_TMP 3,1,2,0,5,6,4,4
%elif ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,6,6
%else
    DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%endif

; cabac_encode_decision: r0 = cb, r1 = context index, r2 = bin value.
cglobal cabac_encode_decision_%1, 1,7
    movifnidn t1d, r1m
    mov   t5d, [r0+cb.range]
    movzx t6d, byte [r0+cb.state+t1] ; t6 = state of this context
    movifnidn t0,  r0 ; WIN64
    mov   t4d, ~1
    mov   t3d, t5d                  ; t3 = range
    and   t4d, t6d                  ; t4 = state with the low bit cleared
    shr   t5d, 6
    movifnidn t2d, r2m
%if WIN64
    PUSH r7
%endif
%if ARCH_X86_64
    lea    r7, [$$]                 ; base register required by LOAD_GLOBAL
%endif
    LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4 ; t5 = LPS range
    LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4  ; t4 = next state
    and   t6d, 1                    ; low bit of the state
    sub   t3d, t5d                  ; t3 = range - range_lps
    cmp   t6d, t2d
    mov   t6d, [t0+cb.low]
    lea    t2, [t6+t3]              ; t2 = low + (range - range_lps)
    cmovne t3d, t5d                 ; bin != state bit: range = range_lps
    cmovne t6d, t2d                 ;                   low  = low + range - range_lps
    mov   [t0+cb.state+t1], t4b
;cabac_encode_renorm
    mov   t4d, t3d
%ifidn %1, bmi2
    lzcnt t3d, t3d
    sub   t3d, 23                   ; t3 = renormalization shift
    shlx  t4d, t4d, t3d
    shlx  t6d, t6d, t3d
%else
    shr   t3d, 3
    LOAD_GLOBAL t3d, cabac_renorm_shift, t3 ; t3 = renormalization shift
    shl   t4d, t3b
    shl   t6d, t3b
%endif
%if WIN64
    POP r7
%endif
    mov   [t0+cb.range], t4d
    add   t3d, [t0+cb.queue]
    jge cabac_putbyte_%1            ; queue became non-negative: output bytes
.update_queue_low:
    mov   [t0+cb.low], t6d
    mov   [t0+cb.queue], t3d
    RET

; cabac_encode_bypass: r0 = cb, r1 = value ANDed with range and added to 2*low.
cglobal cabac_encode_bypass_%1, 2,3
    mov       t7d, [r0+cb.low]
    and       r1d, [r0+cb.range]
    lea       t7d, [t7*2+r1]        ; low = 2*low + (r1 & range)
    movifnidn  t0, r0 ; WIN64
    mov       t3d, [r0+cb.queue]
    inc       t3d
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
    jge cabac_putbyte_%1
%else
    jge .putbyte
%endif
    mov   [t0+cb.low], t7d
    mov   [t0+cb.queue], t3d
    RET
%if ARCH_X86_64 == 0
.putbyte:
    PROLOGUE 0,7
    movifnidn t6d, t7d              ; cabac_putbyte expects low in t6
    jmp cabac_putbyte_%1
%endif

%ifnidn %1,bmi2
; cabac_encode_terminal: r0 = cb.
cglobal cabac_encode_terminal_%1, 1,3
    sub  dword [r0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
    test dword [r0+cb.range], 0x100
    je .renorm
    RET
.renorm:
    shl  dword [r0+cb.low], 1
    shl  dword [r0+cb.range], 1
    inc  dword [r0+cb.queue]
    jge .putbyte
    RET
.putbyte:
    PROLOGUE 0,7
    movifnidn t0, r0 ; WIN64
    mov t3d, [r0+cb.queue]
    mov t6d, [t0+cb.low]
    ; falls through into cabac_putbyte_%1
%endif

cabac_putbyte_%1:
    ; alive: t0=cb t3=queue t6=low
%if WIN64
    DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
%ifidn %1, bmi2
    add   t3d, 10
    shrx  t2d, t6d, t3d             ; t2 = out = low >> (queue+10)
    bzhi  t6d, t6d, t3d             ; keep only the low queue+10 bits of low
    sub   t3d, 18                   ; queue -= 8
%else
    mov   t1d, -1
    add   t3d, 10
    mov   t2d, t6d
    shl   t1d, t3b
    shr   t2d, t3b ; out
    not   t1d                       ; t1 = mask of the low queue+10 bits
    sub   t3d, 18                   ; queue -= 8
    and   t6d, t1d
%endif
    mov   t5d, [t0+cb.bytes_outstanding]
    cmp   t2b, 0xff ; FIXME is a 32bit op faster?
    jz    .postpone                 ; 0xff: defer output, only count the byte
    mov   t1, [t0+cb.p]
    add   [t1-1], t2h               ; add the carry (bits 8+ of out) to the previous byte
    dec   t2h                       ; carry-1: value written for each outstanding byte
.loop_outstanding:
    mov   [t1], t2h
    inc   t1
    dec   t5d
    jge .loop_outstanding
    mov   [t1-1], t2b               ; last slot written receives the new output byte
    mov   [t0+cb.p], t1
.postpone:
    inc   t5d
    mov   [t0+cb.bytes_outstanding], t5d
    jmp mangle(private_prefix %+ _cabac_encode_decision_%1.update_queue_low)
%endmacro
|
||||
|
||||
; Generic and BMI2 flavours of the CABAC encoder primitives.
CABAC asm
CABAC bmi2
|
||||
|
||||
%if ARCH_X86_64
|
||||
; %1 = label name
; %2 = node_ctx init?
; Adds the estimated cost of one coefficient level to the bit counter.
; Registers: r0d = accumulated bit cost, r1d = abs(coefficient) (clobbered),
;            r2d = node context (input unless %2, always updated),
;            r8  = pointer to the level context states,
;            r9-r11 = scratch.
; Requires GLOBAL to be defined by the enclosing function.
%macro COEFF_ABS_LEVEL_GT1 2
%if %2
    %define ctx 1
%else
    movzx  r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
    %define ctx r11
%endif
    movzx  r9d, byte [r8+ctx]
; if( coeff_abs > 1 )
    cmp    r1d, 1
    jg .%1_gt1
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
    movzx  r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx  r9d, word [cabac_entropy+r9*2 GLOBAL]
    lea    r0d, [r0+r9+256]
    mov    [r8+ctx], r10b
%if %2
    mov    r2d, 1
%else
    movzx  r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
%endif
    jmp .%1_end

.%1_gt1:
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
    movzx  r10d, byte [cabac_transition+r9*2+1 GLOBAL]
    xor    r9d, 1
    movzx  r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov    [r8+ctx], r10b
    add    r0d, r9d
%if %2
    %define ctx 5
%else
    movzx  r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
    %define ctx r11
%endif
; if( coeff_abs < 15 )
    cmp    r1d, 15
    jge .%1_escape
    shl    r1d, 7
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
    movzx  r9d, byte [r8+ctx]
    add    r9d, r1d
    movzx  r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
    movzx  r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
    mov    [r8+ctx], r10b
    add    r0d, r9d
    jmp .%1_gt1_end

.%1_escape:
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
    movzx  r9d, byte [r8+ctx]
    movzx  r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
    movzx  r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
    add    r0d, r9d
    mov    [r8+ctx], r10b
    sub    r1d, 14
%if cpuflag(lzcnt)
    lzcnt  r9d, r1d
    xor    r9d, 0x1f                ; 31 - lzcnt = index of the highest set bit
%else
    bsr    r9d, r1d
%endif
; bs_size_ue_big(coeff_abs-15)<<8
    shl    r9d, 9
; (ilog2(coeff_abs-14)+1) << 8
    lea    r0d, [r0+r9+256]
.%1_gt1_end:
%if %2
    mov    r2d, 4
%else
    movzx  r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
%endif
.%1_end:
%endmacro
|
||||
|
||||
; LOAD_DCTCOEF dst
; Loads coefficient number r6 from the array addressed by "dct" into %1:
; a dword when HIGH_BIT_DEPTH, otherwise a zero-extended word.
%macro LOAD_DCTCOEF 1
%if HIGH_BIT_DEPTH
    mov   %1, [dct+r6*4]
%else
    movzx %1, word [dct+r6*2]
%endif
%endmacro
|
||||
|
||||
; ABS_DCTCOEFS src, count
;   %1 = pointer to the source coefficients
;   %2 = number of coefficients
; Writes the absolute values of %2 coefficients from [%1] to [rsp].
; The number of vector registers used depends on how many vectors of mmsize
; bytes are needed to hold %2*SIZEOF_DCTCOEF bytes (one, two, or groups of four).
%macro ABS_DCTCOEFS 2
%if HIGH_BIT_DEPTH
    %define %%abs ABSD
%else
    %define %%abs ABSW
%endif
%if mmsize == %2*SIZEOF_DCTCOEF
    ; everything fits in a single vector
    %%abs   m0, [%1], m1
    mova [rsp], m0
%elif mmsize == %2*SIZEOF_DCTCOEF/2
    ; two vectors
    %%abs   m0, [%1+0*mmsize], m2
    %%abs   m1, [%1+1*mmsize], m3
    mova [rsp+0*mmsize], m0
    mova [rsp+1*mmsize], m1
%else
    ; four vectors per repetition
%assign i 0
%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
    %%abs   m0, [%1+(4*i+0)*mmsize], m4
    %%abs   m1, [%1+(4*i+1)*mmsize], m5
    %%abs   m2, [%1+(4*i+2)*mmsize], m4
    %%abs   m3, [%1+(4*i+3)*mmsize], m5
    mova [rsp+(4*i+0)*mmsize], m0
    mova [rsp+(4*i+1)*mmsize], m1
    mova [rsp+(4*i+2)*mmsize], m2
    mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
%endif
%endmacro
|
||||
|
||||
; SIG_OFFSET is8x8
; In 8x8 mode (%1 != 0) loads the byte at [r4+r6] into r11d, where r4 is the
; 8x8 significance offset table and r6 the coefficient index. Emits nothing
; otherwise.
%macro SIG_OFFSET 1
%if %1
    movzx  r11d, byte [r4+r6]
%endif
%endmacro
|
||||
|
||||
; LAST_OFFSET is8x8
; In 8x8 mode (%1 != 0) loads last_coeff_flag_offset_8x8[r6] into r11d.
; Emits nothing otherwise. Requires GLOBAL to be defined.
%macro LAST_OFFSET 1
%if %1
    movzx  r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
%endif
%endmacro
|
||||
|
||||
; COEFF_LAST table, ctx_block_cat
; Calls the coeff_last function selected by entry %2 of the offset table %1
; (as built by COEFF_LAST_TABLE). Clobbers r1 and r6 in addition to whatever
; the called function modifies. Requires GLOBAL to be defined.
%macro COEFF_LAST 2 ; table, ctx_block_cat
    lea    r1, [%1 GLOBAL]
    movsxd r6, [r1+4*%2]            ; signed offset of the function from the table
    add    r6, r1
    call   r6
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
|
||||
; int ctx_block_cat, x264_cabac_t *cb );
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
;%1 = 8x8 mode
;%2 = coeff_last offset table to use (see COEFF_LAST_TABLE)
; Arguments: r0 = l (coefficients), r1 = b_interlaced, r2 = ctx_block_cat,
;            r3 = cb.
; Accumulates the estimated bit cost into cb.bits_encoded and updates the
; context states in cb.state; no bitstream bytes are written here.
%macro CABAC_RESIDUAL_RD 2
%if %1
    %define func cabac_block_residual_8x8_rd_internal
    %define maxcoeffs 64
    %define dct rsp
%else
    %define func cabac_block_residual_rd_internal
    %define maxcoeffs 16
    %define dct r4
%endif

cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
    lea    r12, [$$]
    %define GLOBAL +r12-$$
    shl     r1d, 4                                                  ; MB_INTERLACED*16
%if %1
    lea     r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]     ; r4 = sig offset 8x8
%endif
    add     r1d, r2d
    movzx   r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL]   ; r5 = ctx_sig
    movzx   r7d, word [last_coeff_flag_offset+r1*2 GLOBAL]          ; r7 = ctx_last
    movzx   r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]       ; r8 = ctx_level

; abs() all the coefficients; copy them to the stack to avoid
; changing the originals.
; overreading is okay; it's all valid aligned data anyways.
%if %1
    ABS_DCTCOEFS r0, 64
%else
    mov      r4, r0              ; r4 = dct
    and      r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
    ABS_DCTCOEFS r4, 16
    xor      r4, r0              ; calculate our new dct pointer
    add      r4, rsp             ; restore AC coefficient offset
%endif
; for improved OOE performance, run coeff_last on the original coefficients.
    COEFF_LAST %2, r2            ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
    mov     r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded
; pre-add some values to simplify addressing
    add      r3, cb.state
    add      r5, r3
    add      r7, r3
    add      r8, r3              ; precalculate cabac state pointers

; if( last != count_cat_m1[ctx_block_cat] )
%if %1
    cmp     r6b, 63
%else
    cmp     r6b, [count_cat_m1+r2 GLOBAL]
%endif
    je .skip_last_sigmap

; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
; so we'll use r11 for this.
%if %1
    %define siglast_ctx r11
%else
    %define siglast_ctx r6
%endif

; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
    SIG_OFFSET %1
    movzx   r1d, byte [r5+siglast_ctx]
    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
    xor     r1d, 1
    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
    mov     [r5+siglast_ctx], r9b
    add     r0d, r1d

    LAST_OFFSET %1
    movzx   r1d, byte [r7+siglast_ctx]
    movzx   r9d, byte [cabac_transition+1+r1*2 GLOBAL]
    xor     r1d, 1
    movzx   r1d, word [cabac_entropy+r1*2 GLOBAL]
    mov     [r7+siglast_ctx], r9b
    add     r0d, r1d
.skip_last_sigmap:
    LOAD_DCTCOEF r1d
    COEFF_ABS_LEVEL_GT1 last, 1
; for( int i = last-1 ; i >= 0; i-- )
    dec     r6d
    jl .end
.coeff_loop:
    LOAD_DCTCOEF r1d
; if( l[i] )
    SIG_OFFSET %1
    movzx   r9d, byte [r5+siglast_ctx]
    test    r1d, r1d
    jnz .coeff_nonzero
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
    movzx   r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov     [r5+siglast_ctx], r10b
    add     r0d, r9d
    dec     r6d
    jge .coeff_loop
    jmp .end
.coeff_nonzero:
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
    movzx   r10d, byte [cabac_transition+r9*2+1 GLOBAL]
    xor     r9d, 1
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov     [r5+siglast_ctx], r10b
    add     r0d, r9d
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
    LAST_OFFSET %1
    movzx   r9d, byte [r7+siglast_ctx]
    movzx   r10d, byte [cabac_transition+r9*2 GLOBAL]
    movzx   r9d, word [cabac_entropy+r9*2 GLOBAL]
    mov     [r7+siglast_ctx], r10b
    add     r0d, r9d
    COEFF_ABS_LEVEL_GT1 coeff, 0
    dec     r6d
    jge .coeff_loop
.end:
    ; r3 was advanced by cb.state above, hence the compensated offset
    mov     [r3+cb.bits_encoded-cb.state], r0d
    RET
%endmacro
|
||||
|
||||
; Instantiate the 4x4-style (0) and 8x8 (1) RD functions per instruction set.
; The ssse3 flavours reuse the sse2 / lzcnt coeff_last tables.
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
; AVX-512: the 16-coefficient version uses zmm only for high bit depth,
; the 64-coefficient version always uses zmm.
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx512
%endif
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
|
||||
; int ctx_block_cat, x264_cabac_t *cb );
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
; CALL_CABAC
; Calls the cabac_encode_decision flavour matching the current cpuflags.
; Expects the arguments already in r0 (cb), r1 (context) and r2 (bin).
; On WIN64, r0 is reloaded from r3 afterwards.
%macro CALL_CABAC 0
%if cpuflag(bmi2)
    call cabac_encode_decision_bmi2
%else
    call cabac_encode_decision_asm
%endif
%if WIN64 ; move cabac back
    mov r0, r3
%endif
%endmacro
|
||||
|
||||
; %1 = 8x8 mode
; %2 = scratch register that receives the coefficient loaded from dct
; %3 = countcat (index of the last coefficient position, count_m1)
; %4 = name (optional label infix, so the macro can be expanded twice)
; Encodes the significance map. r10 is the running coefficient index i;
; every non-zero coefficient is appended to the coeffs array on the stack.
; Relies on the register/stack %defines made by CABAC_RESIDUAL (sigoffd,
; lastoffd, coeffidxd/q, coeffs, dct, lastm, sigoff_8x8).
%macro SIGMAP_LOOP 3-4
.sigmap_%4loop:
%if HIGH_BIT_DEPTH
    mov      %2, [dct+r10*4]
%else
    movsx    %2, word [dct+r10*2]
%endif
%if %1
    movzx   r1d, byte [sigoff_8x8 + r10]
    add     r1d, sigoffd
%else
    lea     r1d, [sigoffd + r10d]
%endif
    test     %2, %2
    jz .sigmap_%4zero                 ; if( l[i] )
    inc coeffidxd
    mov [coeffs+coeffidxq*4], %2      ; coeffs[++coeff_idx] = l[i];
    mov     r2d, 1
    CALL_CABAC                        ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
%if %1
    movzx   r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
    add     r1d, lastoffd
%else
    lea     r1d, [lastoffd + r10d]
%endif
    cmp    r10d, lastm                ; if( i == last )
    je .sigmap_%4last
    xor     r2d, r2d
    CALL_CABAC                        ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
    jmp .sigmap_%4loop_endcheck
.sigmap_%4zero:
    xor     r2d, r2d
    CALL_CABAC                        ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
.sigmap_%4loop_endcheck:
    inc    r10d
    cmp    r10d, %3
    jne .sigmap_%4loop                ; if( ++i == count_m1 )
    ; reached the final position: store its coefficient without coding a flag
%if HIGH_BIT_DEPTH
    mov      %2, [dct+r10*4]
%else
    movsx    %2, word [dct+r10*2]
%endif
    inc coeffidxd
    mov [coeffs+coeffidxq*4], %2      ; coeffs[++coeff_idx] = l[i]
    jmp .sigmap_%4end
.sigmap_%4last:                       ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
    mov     r2d, 1
    CALL_CABAC
.sigmap_%4end:
%if %1==0
    jmp .level_loop_start
%endif
%endmacro
|
||||
|
||||
; CABAC_RESIDUAL table
;   %1 = coeff_last offset table to use (see COEFF_LAST_TABLE)
; Arguments: r0 = l (coefficients), r1 = b_interlaced, r2 = ctx_block_cat,
;            r3 = cb.
; First codes the significance map (SIGMAP_LOOP), collecting the non-zero
; coefficients on the stack, then codes their levels and signs starting with
; the coefficient collected last.
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15,0,-4*64
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
    lea    r7, [$$]
    %define lastm [rsp+4*1]
    %define GLOBAL +r7-$$
    shl   r1d, 4

    %define sigoffq r8
    %define sigoffd r8d
    %define lastoffq r9
    %define lastoffd r9d
    %define leveloffq r10
    %define leveloffd r10d
    %define leveloffm [rsp+4*0]
    %define countcatd r11d
    %define sigoff_8x8 r12
    %define coeffidxq r13
    %define coeffidxd r13d
    %define dct r14
    %define coeffs rsp+4*2

    lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
    add   r1d, r2d
    movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
    movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
    movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
    movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
    mov   coeffidxd, -1
    mov   dct, r0
    mov   leveloffm, leveloffd      ; spill: r10 is reused as the loop index below

    COEFF_LAST %1, r2
    mov   lastm, eax
; put cabac in r0; needed for cabac_encode_decision
    mov   r0, r3

    xor   r10d, r10d
    cmp   countcatd, 63
    je .sigmap_8x8
    SIGMAP_LOOP 0, r12d, countcatd
.sigmap_8x8:
    SIGMAP_LOOP 1, r11d, 63, _8x8
.level_loop_start:
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
    %define nodectxq r8
    %define nodectxd r8d
    mov   leveloffd, leveloffm
    xor   nodectxd, nodectxd
.level_loop:
    mov   r9d, [coeffs+coeffidxq*4]
    mov   r11d, r9d
    sar   r11d, 31                  ; r11 = sign mask (0 or -1)
    add   r9d, r11d
    movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
    xor   r9d, r11d                 ; r9 = abs(coeff)
    add   r1d, leveloffd
    cmp   r9d, 1
    jg .level_gt1
    xor   r2d, r2d
    CALL_CABAC
    movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
    jmp .level_sign
.level_gt1:
    mov   r2d, 1
    CALL_CABAC
    movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
    add   r14d, leveloffd           ; r14 = context used for the remaining level bins
    cmp   r9d, 15
    mov   r12d, 15
    cmovl r12d, r9d                 ; r12 = min(abs, 15)
    sub   r12d, 2
    jz .level_eq2
.level_gt1_loop:
    mov   r1d, r14d
    mov   r2d, 1
    CALL_CABAC
    dec   r12d
    jg .level_gt1_loop
    cmp   r9d, 15
    jge .level_bypass
.level_eq2:
    mov   r1d, r14d
    xor   r2d, r2d
    CALL_CABAC
    jmp .level_gt1_end
.level_bypass:
    lea   r2d, [r9d-15]
    xor   r1d, r1d
    push   r0
; we could avoid this if we implemented it in asm, but I don't feel like that
; right now.
%if UNIX64
    push   r7
    push   r8
%else
    sub   rsp, 40 ; shadow space and alignment
%endif
    call cabac_encode_ue_bypass
%if UNIX64
    pop    r8
    pop    r7
%else
    add   rsp, 40
%endif
    pop    r0
.level_gt1_end:
    movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
    mov   r1d, r11d                 ; sign mask is the bypass argument
%if cpuflag(bmi2)
    call cabac_encode_bypass_bmi2
%else
    call cabac_encode_bypass_asm
%endif
%if WIN64
    mov   r0, r3
%endif
    dec   coeffidxd
    jge .level_loop
    RET
%endmacro
|
||||
|
||||
; Instantiate cabac_block_residual_internal once per coeff_last table.
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM lzcnt
CABAC_RESIDUAL coeff_last_lzcnt
INIT_XMM avx2
CABAC_RESIDUAL coeff_last_avx2
INIT_XMM avx512
CABAC_RESIDUAL coeff_last_avx512
|
||||
%endif
|
||||
82
common/x86/const-a.asm
Normal file
82
common/x86/const-a.asm
Normal file
@@ -0,0 +1,82 @@
|
||||
;*****************************************************************************
|
||||
;* const-a.asm: x86 global constants
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2010-2025 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Fiona Glaser <fiona@x264.com>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
|
||||
SECTION_RODATA 32
|
||||
|
||||
; Shared read-only vector constants.
; Each constant in this first group occupies 32 bytes, except pd_0123 and
; pd_4567 which are 16 bytes each.
const pb_1, times 32 db 1
|
||||
const hsub_mul, times 16 db 1, -1
|
||||
const pw_1, times 16 dw 1
|
||||
const pw_16, times 16 dw 16
|
||||
const pw_32, times 16 dw 32
|
||||
const pw_512, times 16 dw 512
|
||||
const pw_00ff, times 16 dw 0x00ff
|
||||
; largest sample value representable at the configured BIT_DEPTH
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
|
||||
const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
|
||||
const pd_1, times 8 dd 1
|
||||
const pd_0123, dd 0,1,2,3
|
||||
const pd_4567, dd 4,5,6,7
|
||||
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
|
||||
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
|
||||
const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
|
||||
|
||||
; The constants below are 16 bytes each.
const pb_01, times 8 db 0,1
|
||||
const pb_0, times 16 db 0
|
||||
const pb_a1, times 16 db 0xa1
|
||||
const pb_3, times 16 db 3
|
||||
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
|
||||
|
||||
const pw_2, times 8 dw 2
|
||||
const pw_m2, times 8 dw -2
|
||||
const pw_4, times 8 dw 4
|
||||
const pw_8, times 8 dw 8
|
||||
const pw_64, times 8 dw 64
|
||||
const pw_256, times 8 dw 256
|
||||
; pw_32_0: four words of 32 followed (next line) by four words of 0
const pw_32_0, times 4 dw 32
|
||||
times 4 dw 0
|
||||
const pw_8000, times 8 dw 0x8000
|
||||
const pw_3fff, times 8 dw 0x3fff
|
||||
; sign patterns: p = +1, m = -1, z = 0
const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
|
||||
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
|
||||
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
|
||||
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
|
||||
|
||||
const pd_8, times 4 dd 8
|
||||
const pd_32, times 4 dd 32
|
||||
const pd_1024, times 4 dd 1024
|
||||
const pd_ffff, times 4 dd 0xffff
|
||||
const pw_ff00, times 8 dw 0xff00
|
||||
|
||||
; 256-entry byte table generated at assembly time: entry x holds the number
; of set bits in the 8-bit value x.
const popcnt_table
|
||||
%assign x 0
|
||||
%rep 256
|
||||
; population count
|
||||
db ((x>>0)&1)+((x>>1)&1)+((x>>2)&1)+((x>>3)&1)+((x>>4)&1)+((x>>5)&1)+((x>>6)&1)+((x>>7)&1)
|
||||
%assign x x+1
|
||||
%endrep
|
||||
|
||||
; single dword (4 bytes), unlike the vector-sized constants above
const sw_64, dd 64
|
||||
107
common/x86/cpu-a.asm
Normal file
107
common/x86/cpu-a.asm
Normal file
@@ -0,0 +1,107 @@
|
||||
;*****************************************************************************
|
||||
;* cpu-a.asm: x86 cpu utilities
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2003-2025 x264 project
|
||||
;*
|
||||
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||
;* Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Fiona Glaser <fiona@x264.com>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
|
||||
;-----------------------------------------------------------------------------
|
||||
; Executes CPUID for leaf `op` (with ecx = 0) and stores eax/ebx/ecx/edx through
; the four pointer arguments. r0..r4 are the five arguments in declaration
; order (op, eax, ebx, ecx, edx per the prototype comment).
cglobal cpu_cpuid, 5,7
|
||||
push rbx ; cpuid overwrites ebx; the original value is restored by the final pop
|
||||
push r4 ; park the four output pointers on the stack: cpuid overwrites
|
||||
push r3 ; eax/ebx/ecx/edx, which may be the registers holding them
|
||||
push r2
|
||||
push r1
|
||||
mov eax, r0d ; requested leaf
|
||||
xor ecx, ecx ; ecx = 0 on entry to cpuid
|
||||
cpuid
|
||||
pop r4 ; pops return the pointers in reverse push order: r1 (eax pointer) first
|
||||
mov [r4], eax
|
||||
pop r4 ; former r2 (ebx pointer)
|
||||
mov [r4], ebx
|
||||
pop r4 ; former r3 (ecx pointer)
|
||||
mov [r4], ecx
|
||||
pop r4 ; former r4 (edx pointer)
|
||||
mov [r4], edx
|
||||
pop rbx
|
||||
RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; uint64_t cpu_xgetbv( int xcr )
|
||||
;-----------------------------------------------------------------------------
|
||||
; Reads the extended control register selected by `xcr` using xgetbv.
; Declared without argument/register counts and ends in a plain `ret`.
cglobal cpu_xgetbv
|
||||
movifnidn ecx, r0m ; xgetbv takes the register index in ecx
|
||||
xgetbv ; result is delivered in edx:eax
|
||||
%if ARCH_X86_64
|
||||
shl rdx, 32 ; on x86-64, merge the two halves into one 64-bit value in rax
|
||||
or rax, rdx
|
||||
%endif
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_emms( void )
|
||||
;-----------------------------------------------------------------------------
|
||||
; Thin wrapper that executes a single `emms` instruction and returns.
cglobal cpu_emms
|
||||
emms
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void cpu_sfence( void )
|
||||
;-----------------------------------------------------------------------------
|
||||
; Thin wrapper that executes a single `sfence` instruction and returns.
cglobal cpu_sfence
|
||||
sfence
|
||||
ret
|
||||
|
||||
%if ARCH_X86_64 == 0
|
||||
;-----------------------------------------------------------------------------
|
||||
; int cpu_cpuid_test( void )
|
||||
; return 0 if unsupported
|
||||
;-----------------------------------------------------------------------------
|
||||
; 32-bit only (guarded by ARCH_X86_64 == 0). Detects CPUID support by testing
; whether bit 21 (mask 0x200000) of EFLAGS can be toggled.
; Returns eax = the EFLAGS bits that changed, i.e. 0 if the bit could not be
; flipped.
cglobal cpu_cpuid_test
|
||||
pushfd ; save the caller's EFLAGS (restored by the final popfd)
|
||||
push ebx
|
||||
push ebp
|
||||
push esi
|
||||
push edi
|
||||
pushfd
|
||||
pop eax ; eax = current EFLAGS
|
||||
mov ebx, eax ; keep the original value for the comparison below
|
||||
xor eax, 0x200000 ; flip bit 21
|
||||
push eax
|
||||
popfd ; attempt to write the modified flags
|
||||
pushfd
|
||||
pop eax ; read EFLAGS back
|
||||
xor eax, ebx ; non-zero only if the write actually changed a bit
|
||||
pop edi
|
||||
pop esi
|
||||
pop ebp
|
||||
pop ebx
|
||||
popfd
|
||||
ret
|
||||
%endif
|
||||
590
common/x86/dct-32.asm
Normal file
590
common/x86/dct-32.asm
Normal file
@@ -0,0 +1,590 @@
|
||||
;*****************************************************************************
|
||||
;* dct-32.asm: x86_32 transform and zigzag
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2003-2025 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Holger Lubitz <holger@lubitz.org>
|
||||
;* Laurent Aimar <fenrir@via.ecp.fr>
|
||||
;* Min Chen <chenm001.163.com>
|
||||
;* Christian Heine <sennindemokrit@gmx.net>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
cextern pd_32
|
||||
cextern pw_pixel_max
|
||||
cextern pw_2
|
||||
cextern pw_m2
|
||||
cextern pw_32
|
||||
cextern hsub_mul
|
||||
|
||||
; Stores a list of vector registers to memory at [ptr + offset*16].
; Arguments are ptr, then N register numbers, then N offsets (in 16-byte units),
; so the i-th register is written to the i-th offset.
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
|
||||
; ptr is captured up front because %rotate below moves it out of %1
%xdefine %%base %1
|
||||
; %0 is the argument count (2N+1), so %0/2 = N iterations
%rep %0/2
|
||||
; %2 is the current entry of the register list
%xdefine %%tmp m%2
|
||||
; rotate by N so that %2 now names the offset paired with that register
%rotate %0/2
|
||||
mova [%%base + %2*16], %%tmp
|
||||
; rotate back by N-1: net effect is an advance of one to the next register
%rotate 1-%0/2
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
; Inverse of SPILL_SHUFFLE: loads a list of vector registers from
; [ptr + offset*16]. Arguments are ptr, N register numbers, then N offsets
; (in 16-byte units).
%macro UNSPILL_SHUFFLE 3-*
|
||||
; ptr is captured up front because %rotate below moves it out of %1
%xdefine %%base %1
|
||||
; %0 is the argument count (2N+1), so %0/2 = N iterations
%rep %0/2
|
||||
; %2 is the current entry of the register list
%xdefine %%tmp m%2
|
||||
; rotate by N so that %2 now names the offset paired with that register
%rotate %0/2
|
||||
mova %%tmp, [%%base + %2*16]
|
||||
; rotate back by N-1: net effect is an advance of one to the next register
%rotate 1-%0/2
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
; Convenience form of SPILL_SHUFFLE: the register list is passed twice, so
; register mN is stored at [ptr + N*16].
%macro SPILL 2+ ; assume offsets are the same as reg numbers
|
||||
SPILL_SHUFFLE %1, %2, %2
|
||||
%endmacro
|
||||
|
||||
; Convenience form of UNSPILL_SHUFFLE: the register list is passed twice, so
; register mN is loaded from [ptr + N*16].
%macro UNSPILL 2+
|
||||
UNSPILL_SHUFFLE %1, %2, %2
|
||||
%endmacro
|
||||
|
||||
; in: size, m0..m7
|
||||
; out: 0,4,6 in memory at %10,%11,%12, rest in regs
|
||||
; One 8-point forward transform pass using only eight vector registers.
; %1      = element-size suffix pasted onto psra/padd/psub (w or d at the call
;           sites in this file)
; %2-%9   = register numbers holding inputs 0..7
; %10-%12 = memory operands that receive outputs 0, 4 and 6
; The closing SWAP renames the registers that hold the remaining outputs.
%macro DCT8_1D 12
|
||||
SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07
|
||||
SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16
|
||||
SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25
|
||||
SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34
|
||||
SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2
|
||||
SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3
|
||||
SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
|
||||
; dst0 and dst4 go to memory so their registers can be reused as scratch below
mova %10, m%7
|
||||
mova %11, m%6
|
||||
psra%1 m%7, m%8, 1 ; a3>>1
|
||||
padd%1 m%7, m%9 ; a2 + (a3>>1)
|
||||
psra%1 m%9, 1 ; a2>>1
|
||||
psub%1 m%9, m%8 ; (a2>>1) - a3
|
||||
; third memory output (output 6 per the macro's in/out description)
mova %12, m%9
|
||||
psra%1 m%6, m%4, 1
|
||||
padd%1 m%6, m%4 ; d25+(d25>>1)
|
||||
psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1))
|
||||
psub%1 m%8, m%6
|
||||
psra%1 m%6, m%3, 1
|
||||
padd%1 m%6, m%3 ; d16+(d16>>1)
|
||||
padd%1 m%9, m%2, m%5
|
||||
psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1))
|
||||
psra%1 m%6, m%2, 1
|
||||
padd%1 m%6, m%2 ; d07+(d07>>1)
|
||||
padd%1 m%6, m%3
|
||||
padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1))
|
||||
psra%1 m%2, m%5, 1
|
||||
padd%1 m%2, m%5 ; d34+(d34>>1)
|
||||
padd%1 m%2, m%3
|
||||
psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1))
|
||||
psra%1 m%5, m%2, 2
|
||||
padd%1 m%5, m%6 ; a4 + (a7>>2)
|
||||
psra%1 m%4, m%9, 2
|
||||
padd%1 m%4, m%8 ; a5 + (a6>>2)
|
||||
psra%1 m%6, 2
|
||||
psra%1 m%8, 2
|
||||
psub%1 m%6, m%2 ; (a4>>2) - a7
|
||||
psub%1 m%9, m%8 ; a6 - (a5>>2)
|
||||
SWAP %3, %5, %4, %7, %9, %6
|
||||
%endmacro
|
||||
|
||||
; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11
|
||||
; out: m0..m7
|
||||
; One 8-point inverse transform pass using only eight vector registers.
; %1      = element-size suffix pasted onto psra/padd/psub (w or d at the call
;           sites in this file)
; %2-%9   = register numbers for rows 0..7; rows 0 and 4 are not expected in
;           registers on entry
; %10,%11 = memory operands holding inputs 0 and 4
; The closing SWAPs rename registers; per the description above the outputs
; end up in m0..m7.
%macro IDCT8_1D 11
|
||||
psra%1 m%2, m%4, 1
|
||||
psra%1 m%6, m%8, 1
|
||||
psub%1 m%2, m%8
|
||||
padd%1 m%6, m%4
|
||||
psra%1 m%8, m%3, 1
|
||||
padd%1 m%8, m%3
|
||||
padd%1 m%8, m%5
|
||||
padd%1 m%8, m%7
|
||||
psra%1 m%4, m%7, 1
|
||||
padd%1 m%4, m%7
|
||||
padd%1 m%4, m%9
|
||||
psub%1 m%4, m%3
|
||||
psub%1 m%3, m%5
|
||||
psub%1 m%7, m%5
|
||||
padd%1 m%3, m%9
|
||||
psub%1 m%7, m%9
|
||||
psra%1 m%5, 1
|
||||
psra%1 m%9, 1
|
||||
psub%1 m%3, m%5
|
||||
psub%1 m%7, m%9
|
||||
psra%1 m%5, m%8, 2
|
||||
psra%1 m%9, m%4, 2
|
||||
padd%1 m%5, m%7
|
||||
padd%1 m%9, m%3
|
||||
psra%1 m%7, 2
|
||||
psra%1 m%3, 2
|
||||
psub%1 m%8, m%7
|
||||
psub%1 m%3, m%4
|
||||
; inputs 0 and 4 are fetched only now, into registers whose previous
; contents are not read again
mova m%4, %10
|
||||
mova m%7, %11
|
||||
SUMSUB_BA %1, %7, %4
|
||||
SUMSUB_BA %1, %6, %7
|
||||
SUMSUB_BA %1, %2, %4
|
||||
SUMSUB_BA %1, %8, %6
|
||||
SUMSUB_BA %1, %3, %2
|
||||
SUMSUB_BA %1, %9, %4
|
||||
SUMSUB_BA %1, %5, %7
|
||||
SWAP %2, %4
|
||||
SWAP %6, %8
|
||||
SWAP %2, %6, %7
|
||||
SWAP %4, %9, %8
|
||||
%endmacro
|
||||
|
||||
%if HIGH_BIT_DEPTH
|
||||
|
||||
; High-bit-depth sub8x8_dct8 for x86-32 (eight vector registers).
; r0 = coefficient output, r1/r2 = the two pixel sources passed to LOAD_DIFF8x4.
; The first pass runs on words (DCT8_1D w); each 4-row half is then transposed,
; widened to dwords (WIDEN_SXWD) and run through a dword pass (DCT8_1D d).
; The output buffer at r0 doubles as scratch space between the passes.
%macro SUB8x8_DCT8 0
|
||||
cglobal sub8x8_dct8, 3,3,8
|
||||
cglobal_label .skip_prologue
|
||||
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
|
||||
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
|
||||
|
||||
; word pass: outputs 0, 4 and 6 are written to [r0], [r0+0x10], [r0+0x50]
DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50]
|
||||
mova m0, [r0] ; bring output 0 back into a register
|
||||
|
||||
; park m5 and m7 so that, together with 0x10/0x50 above, rows 4..7 can be
; reloaded after the first half has been processed
mova [r0+0x30], m5
|
||||
mova [r0+0x70], m7
|
||||
TRANSPOSE4x4W 0,1,2,3,4
|
||||
WIDEN_SXWD 0,4
|
||||
WIDEN_SXWD 1,5
|
||||
WIDEN_SXWD 2,6
|
||||
WIDEN_SXWD 3,7
|
||||
DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0]
|
||||
mova [r0+0x20], m4
|
||||
mova [r0+0x40], m1
|
||||
mova [r0+0x60], m5
|
||||
mova [r0+0xA0], m6
|
||||
mova [r0+0xE0], m7
|
||||
; reload the four rows parked at 0x10/0x30/0x50/0x70 during the word pass
mova m4, [r0+0x10]
|
||||
mova m5, [r0+0x30]
|
||||
mova m6, [r0+0x50]
|
||||
mova m7, [r0+0x70]
|
||||
|
||||
TRANSPOSE4x4W 4,5,6,7,0
|
||||
WIDEN_SXWD 4,0
|
||||
WIDEN_SXWD 5,1
|
||||
WIDEN_SXWD 6,2
|
||||
WIDEN_SXWD 7,3
|
||||
DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0]
|
||||
mova [r0+0x30], m0
|
||||
mova [r0+0x50], m5
|
||||
mova [r0+0x70], m1
|
||||
mova [r0+0xB0], m2
|
||||
mova [r0+0xF0], m3
|
||||
ret
|
||||
%endmacro ; SUB8x8_DCT8
|
||||
|
||||
; instantiate the function once per instruction set
INIT_XMM sse2
|
||||
SUB8x8_DCT8
|
||||
INIT_XMM sse4
|
||||
SUB8x8_DCT8
|
||||
INIT_XMM avx
|
||||
SUB8x8_DCT8
|
||||
|
||||
; High-bit-depth add8x8_idct8 for x86-32 (eight vector registers).
; r0 = destination pixels, r1 = coefficient block, which is also used as
; scratch storage between passes.
; r1 is advanced by 128 up front, so the block is addressed as r1-128..r1+112;
; the .skip_prologue entry point therefore expects r1 to be biased already.
%macro ADD8x8_IDCT8 0
|
||||
cglobal add8x8_idct8, 2,2
|
||||
add r1, 128
|
||||
cglobal_label .skip_prologue
|
||||
; SPILL/UNSPILL offsets are in 16-byte units relative to the biased r1
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6
|
||||
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0]
|
||||
mova [r1+0], m4 ; m4 is needed as the transpose temporary
|
||||
TRANSPOSE4x4D 0,1,2,3,4
|
||||
; adds 32 to every lane of m0 -- presumably the rounding term for the final
; downshift, which is not visible in this file (TODO confirm in STORE_DIFF)
paddd m0, [pd_32]
|
||||
mova m4, [r1+0]
|
||||
SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2
|
||||
TRANSPOSE4x4D 4,5,6,7,3
|
||||
paddd m4, [pd_32]
|
||||
SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6
|
||||
; same sequence for the odd-numbered 16-byte slots
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7
|
||||
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16]
|
||||
mova [r1+16], m4
|
||||
TRANSPOSE4x4D 0,1,2,3,4
|
||||
mova m4, [r1+16]
|
||||
mova [r1-112], m0
|
||||
TRANSPOSE4x4D 4,5,6,7,0
|
||||
SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7
|
||||
UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2
|
||||
IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112]
|
||||
SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1
|
||||
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7
|
||||
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16]
|
||||
; m5..m7 go to r1+80/+96/+112 so the registers can hold the store constants
SPILL_SHUFFLE r1, 7,6,5, 7,6,5
|
||||
mova m7, [pw_pixel_max]
|
||||
pxor m6, m6
|
||||
; each output row combines the value saved at r1-128+16*row with the
; matching register (rows 0..4) or spilled value (rows 5..7)
mova m5, [r1-128]
|
||||
STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB]
|
||||
mova m0, [r1-112]
|
||||
STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB]
|
||||
mova m0, [r1-96]
|
||||
STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB]
|
||||
mova m0, [r1-80]
|
||||
STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB]
|
||||
mova m0, [r1-64]
|
||||
STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB]
|
||||
mova m0, [r1-48]
|
||||
mova m1, [r1+80]
|
||||
STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB]
|
||||
mova m0, [r1-32]
|
||||
mova m1, [r1+96]
|
||||
STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB]
|
||||
mova m0, [r1-16]
|
||||
mova m1, [r1+112]
|
||||
STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB]
|
||||
RET
|
||||
%endmacro ; ADD8x8_IDCT8
|
||||
|
||||
; instantiate the function once per instruction set
INIT_XMM sse2
|
||||
ADD8x8_IDCT8
|
||||
INIT_XMM avx
|
||||
ADD8x8_IDCT8
|
||||
|
||||
%else ; !HIGH_BIT_DEPTH
|
||||
|
||||
INIT_MMX
|
||||
ALIGN 16
|
||||
; Local helper (plain label, not cglobal): loads eight rows through LOAD_DIFF
; from [r1 + row*FENC_STRIDE] and [r2 + row*FDEC_STRIDE] into m0..m7.
; Uses [r0] as an 8-byte scratch slot.
load_diff_4x8_mmx:
|
||||
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
|
||||
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
|
||||
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
|
||||
LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
|
||||
LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
|
||||
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
|
||||
; rows 0..6 use m7 as LOAD_DIFF's second operand; the last row targets m7
; itself and uses m0 instead, so row 0 is parked in [r0] and reloaded after
movq [r0], m0
|
||||
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
|
||||
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
|
||||
movq m0, [r0]
|
||||
ret
|
||||
|
||||
; Callable wrapper around one word-sized DCT8_1D pass on m0..m7.
; Outputs 0, 4 and 6 are written to [r0], [r0+0x40], [r0+0x60]; the rest stay
; in registers. SAVE_MM_PERMUTATION records the register renaming done by the
; macro's SWAP -- presumably so callers see the same names after `call`
; (confirm against x86inc).
cglobal dct8_mmx
|
||||
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
|
||||
SAVE_MM_PERMUTATION
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
|
||||
;-----------------------------------------------------------------------------
|
||||
; MMX implementation of sub8x8_dct8 (r0 = dct output, r1 = pix1, r2 = pix2).
; The 8x8 block is handled as two 4-column halves. Each half is loaded with
; load_diff_4x8_mmx, transformed with dct8_mmx and transposed in 4x4 tiles;
; dct8_mmx is then run again on the transposed data.
; r0 serves both as the final output and as spill space, with 16 bytes per row.
cglobal sub8x8_dct8_mmx, 3,3
|
||||
global sub8x8_dct8_mmx.skip_prologue
|
||||
.skip_prologue:
|
||||
RESET_MM_PERMUTATION
|
||||
call load_diff_4x8_mmx
|
||||
call dct8_mmx
|
||||
; dct8_mmx left outputs 0, 4 and 6 in memory at r0+0, +0x40, +0x60
UNSPILL r0, 0
|
||||
TRANSPOSE4x4W 0,1,2,3,4
|
||||
SPILL r0, 0,1,2,3
|
||||
UNSPILL r0, 4,6
|
||||
TRANSPOSE4x4W 4,5,6,7,0
|
||||
SPILL r0, 4,5,6,7
|
||||
RESET_MM_PERMUTATION
|
||||
; second half: 4 bytes further into both pixel sources, 8 bytes into each
; 16-byte coefficient row
add r1, 4
|
||||
add r2, 4
|
||||
add r0, 8
|
||||
call load_diff_4x8_mmx
|
||||
sub r1, 4
|
||||
sub r2, 4
|
||||
call dct8_mmx
|
||||
sub r0, 8
|
||||
UNSPILL r0+8, 4,6
|
||||
TRANSPOSE4x4W 4,5,6,7,0
|
||||
SPILL r0+8, 4,5,6,7
|
||||
UNSPILL r0+8, 0
|
||||
TRANSPOSE4x4W 0,1,2,3,5
|
||||
UNSPILL r0, 4,5,6,7
|
||||
SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
|
||||
; the moves below mix physical names (mmN) with permuted names (mN)
movq mm4, m6 ; depends on the permutation to not produce conflicts
|
||||
movq mm0, m4
|
||||
movq mm1, m5
|
||||
movq mm2, mm4
|
||||
movq mm3, m7
|
||||
RESET_MM_PERMUTATION
|
||||
UNSPILL r0+8, 4,5,6,7
|
||||
add r0, 8
|
||||
call dct8_mmx
|
||||
sub r0, 8
|
||||
; outputs 0, 4 and 6 were already stored by dct8_mmx; store the other five
SPILL r0+8, 1,2,3,5,7
|
||||
RESET_MM_PERMUTATION
|
||||
UNSPILL r0, 0,1,2,3,4,5,6,7
|
||||
call dct8_mmx
|
||||
SPILL r0, 1,2,3,5,7
|
||||
ret
|
||||
|
||||
; Callable wrapper around one word-sized IDCT8_1D pass.
; Inputs 0 and 4 are read from [r1+0] and [r1+64]; the other six inputs must
; already be in registers. SAVE_MM_PERMUTATION records the register renaming
; done by the macro's SWAPs -- presumably so callers see the same names after
; `call` (confirm against x86inc).
cglobal idct8_mmx
|
||||
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
|
||||
SAVE_MM_PERMUTATION
|
||||
ret
|
||||
|
||||
; Adds two operands of 16-bit values to one 8-pixel row at
; [r0 + %1*FDEC_STRIDE] and stores the row back with unsigned saturation.
; %1 = row index, %2 = addend for the low four pixels, %3 = addend for the
; high four pixels.
; Requires m0 == 0 (used to zero-extend bytes to words); overwrites m1 and m2.
%macro ADD_STORE_ROW 3
|
||||
movq m1, [r0+%1*FDEC_STRIDE]
|
||||
punpckhbw m2, m1, m0 ; high four pixels widened to words
|
||||
punpcklbw m1, m0 ; low four pixels widened to words
|
||||
paddw m1, %2
|
||||
paddw m2, %3
|
||||
packuswb m1, m2 ; back to bytes, saturating
|
||||
movq [r0+%1*FDEC_STRIDE], m1
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
|
||||
;-----------------------------------------------------------------------------
|
||||
; MMX implementation of add8x8_idct8 (r0 = destination pixels, r1 = dct block).
; The coefficient block at r1 is modified in place and used as scratch.
; The block is processed as two 4-column halves (r1 and r1+8): idct8_mmx plus
; 4x4 transposes on each half, then idct8_mmx again on the transposed data.
; Results are shifted right by 6 and added to the destination.
cglobal add8x8_idct8_mmx, 2,2
|
||||
global add8x8_idct8_mmx.skip_prologue
|
||||
.skip_prologue:
|
||||
INIT_MMX
|
||||
; add 32 to the first coefficient: half of the 64 implied by the later
; psraw 6, i.e. presumably the rounding term
add word [r1], 32
|
||||
; idct8_mmx reads inputs 0 and 4 from memory itself, so only the other six
; rows are loaded here
UNSPILL r1, 1,2,3,5,6,7
|
||||
call idct8_mmx
|
||||
SPILL r1, 7
|
||||
TRANSPOSE4x4W 0,1,2,3,7
|
||||
SPILL r1, 0,1,2,3
|
||||
UNSPILL r1, 7
|
||||
TRANSPOSE4x4W 4,5,6,7,0
|
||||
SPILL r1, 4,5,6,7
|
||||
INIT_MMX
|
||||
; same again for the second half, 8 bytes into each 16-byte row
UNSPILL r1+8, 1,2,3,5,6,7
|
||||
add r1, 8
|
||||
call idct8_mmx
|
||||
sub r1, 8
|
||||
SPILL r1+8, 7
|
||||
TRANSPOSE4x4W 0,1,2,3,7
|
||||
SPILL r1+8, 0,1,2,3
|
||||
UNSPILL r1+8, 7
|
||||
TRANSPOSE4x4W 4,5,6,7,0
|
||||
SPILL r1+8, 4,5,6,7
|
||||
INIT_MMX
|
||||
; exchange the 8-byte groups at r1+0x08 and r1+0x40
movq m3, [r1+0x08]
|
||||
movq m0, [r1+0x40]
|
||||
movq [r1+0x40], m3
|
||||
movq [r1+0x08], m0
|
||||
; memory layout at this time:
|
||||
; A0------ A1------
|
||||
; B0------ F0------
|
||||
; C0------ G0------
|
||||
; D0------ H0------
|
||||
; E0------ E1------
|
||||
; B1------ F1------
|
||||
; C1------ G1------
|
||||
; D1------ H1------
|
||||
UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
|
||||
UNSPILL r1+8, 5,6,7
|
||||
add r1, 8
|
||||
call idct8_mmx
|
||||
sub r1, 8
|
||||
psraw m0, 6
|
||||
psraw m1, 6
|
||||
psraw m2, 6
|
||||
psraw m3, 6
|
||||
psraw m4, 6
|
||||
psraw m5, 6
|
||||
psraw m6, 6
|
||||
psraw m7, 6
|
||||
; store this half's results into the +8 column while loading the inputs of
; the last pass; the trailing mmN notes name the physical register behind
; each permuted mN
movq [r1+0x08], m0 ; mm4
|
||||
movq [r1+0x48], m4 ; mm5
|
||||
movq [r1+0x58], m5 ; mm0
|
||||
movq [r1+0x68], m6 ; mm2
|
||||
movq [r1+0x78], m7 ; mm6
|
||||
movq mm5, [r1+0x18]
|
||||
movq mm6, [r1+0x28]
|
||||
movq [r1+0x18], m1 ; mm1
|
||||
movq [r1+0x28], m2 ; mm7
|
||||
movq mm7, [r1+0x38]
|
||||
movq [r1+0x38], m3 ; mm3
|
||||
movq mm1, [r1+0x10]
|
||||
movq mm2, [r1+0x20]
|
||||
movq mm3, [r1+0x30]
|
||||
call idct8_mmx
|
||||
psraw m0, 6
|
||||
psraw m1, 6
|
||||
psraw m2, 6
|
||||
psraw m3, 6
|
||||
psraw m4, 6
|
||||
psraw m5, 6
|
||||
psraw m6, 6
|
||||
psraw m7, 6
|
||||
; m0..m2 are spilled because ADD_STORE_ROW needs m0 as zero and uses m1/m2
; as temporaries; rows 0..2 therefore take both operands from memory
SPILL r1, 0,1,2
|
||||
pxor m0, m0
|
||||
ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
|
||||
ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
|
||||
ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
|
||||
ADD_STORE_ROW 3, m3, [r1+0x38]
|
||||
ADD_STORE_ROW 4, m4, [r1+0x48]
|
||||
ADD_STORE_ROW 5, m5, [r1+0x58]
|
||||
ADD_STORE_ROW 6, m6, [r1+0x68]
|
||||
ADD_STORE_ROW 7, m7, [r1+0x78]
|
||||
ret
|
||||
|
||||
; Emits two 8-bit-depth functions for x86-32 (eight vector registers):
;   sub8x8_dct  - built from DCT4_1D passes, stored with STORE_DCT
;   sub8x8_dct8 - built from two DCT8_1D passes around an 8x8 transpose
; In both, r0 = coefficient output (also used as spill space), r1/r2 = the
; two pixel sources. r2 is advanced by 4*FDEC_STRIDE before .skip_prologue,
; so callers entering at .skip_prologue must pass r2 already biased.
%macro DCT_SUB8 0
|
||||
cglobal sub8x8_dct, 3,3
|
||||
add r2, 4*FDEC_STRIDE
|
||||
cglobal_label .skip_prologue
|
||||
%if cpuflag(ssse3)
|
||||
mova m7, [hsub_mul]
|
||||
%endif
|
||||
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
|
||||
; only eight registers are available: spill around the second load, which
; needs two further registers for its own use
SPILL r0, 1,2
|
||||
SWAP 2, 7
|
||||
LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
|
||||
UNSPILL r0, 1
|
||||
SPILL r0, 7
|
||||
SWAP 2, 7
|
||||
UNSPILL r0, 2
|
||||
; each DCT4_1D / TRANSPOSE2x4x4W needs one temporary register (last
; argument), freed by spilling it to r0 and reloading afterwards
DCT4_1D 0, 1, 2, 3, 7
|
||||
TRANSPOSE2x4x4W 0, 1, 2, 3, 7
|
||||
UNSPILL r0, 7
|
||||
SPILL r0, 2
|
||||
DCT4_1D 4, 5, 6, 7, 2
|
||||
TRANSPOSE2x4x4W 4, 5, 6, 7, 2
|
||||
UNSPILL r0, 2
|
||||
SPILL r0, 6
|
||||
DCT4_1D 0, 1, 2, 3, 6
|
||||
UNSPILL r0, 6
|
||||
STORE_DCT 0, 1, 2, 3, r0, 0
|
||||
DCT4_1D 4, 5, 6, 7, 3
|
||||
STORE_DCT 4, 5, 6, 7, r0, 64
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal sub8x8_dct8, 3,3
|
||||
add r2, 4*FDEC_STRIDE
|
||||
cglobal_label .skip_prologue
|
||||
%if cpuflag(ssse3)
|
||||
mova m7, [hsub_mul]
|
||||
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
|
||||
SPILL r0, 0,1
|
||||
SWAP 1, 7
|
||||
LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
|
||||
UNSPILL r0, 0,1
|
||||
%else
|
||||
; r2 was advanced by four rows above, hence the -4..+3 row offsets
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
|
||||
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
|
||||
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
|
||||
LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
|
||||
LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
|
||||
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
|
||||
; the last row targets m7 and uses m0 as LOAD_DIFF's second operand,
; so row 0 is parked in memory meanwhile
SPILL r0, 0
|
||||
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
|
||||
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
|
||||
UNSPILL r0, 0
|
||||
%endif
|
||||
; DCT8_1D leaves outputs 0, 4 and 6 in memory at r0+0, +0x40, +0x60
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
|
||||
UNSPILL r0, 0,4
|
||||
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
|
||||
UNSPILL r0, 4
|
||||
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
|
||||
; outputs 0, 4 and 6 are already in place; store the remaining five
SPILL r0, 1,2,3,5,7
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
; for the sse2 build only, movdqa and punpcklqdq are textually replaced by
; movaps and movlhps (presumably for their shorter encodings -- TODO confirm);
; the %undef lines restore the real mnemonics for the later builds
%define movdqa movaps
|
||||
%define punpcklqdq movlhps
|
||||
DCT_SUB8
|
||||
%undef movdqa
|
||||
%undef punpcklqdq
|
||||
INIT_XMM ssse3
|
||||
DCT_SUB8
|
||||
INIT_XMM avx
|
||||
DCT_SUB8
|
||||
INIT_XMM xop
|
||||
DCT_SUB8
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
|
||||
;-----------------------------------------------------------------------------
|
||||
; add8x8_idct for 8-bit depth on x86-32 (eight vector registers).
; r0 = destination pixels, r1 = coefficients (also used as spill space).
; r0 is advanced by 4*FDEC_STRIDE so the eight rows are addressed as
; r0-4*FDEC_STRIDE .. r0+3*FDEC_STRIDE; the .skip_prologue entry point expects
; r0 to be biased already.
%macro ADD8x8 0
|
||||
cglobal add8x8_idct, 2,2
|
||||
add r0, 4*FDEC_STRIDE
|
||||
cglobal_label .skip_prologue
|
||||
; load the eight 16-byte groups; note the register order 0,2,1,3 / 4,6,5,7
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
|
||||
SBUTTERFLY qdq, 0, 1, 4
|
||||
SBUTTERFLY qdq, 2, 3, 4
|
||||
UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
|
||||
; m0 is spilled so it can serve as the SBUTTERFLY temporary
SPILL r1, 0
|
||||
SBUTTERFLY qdq, 4, 5, 0
|
||||
SBUTTERFLY qdq, 6, 7, 0
|
||||
UNSPILL r1,0
|
||||
IDCT4_1D w,0,1,2,3,r1
|
||||
SPILL r1, 4
|
||||
TRANSPOSE2x4x4W 0,1,2,3,4
|
||||
UNSPILL r1, 4
|
||||
IDCT4_1D w,4,5,6,7,r1
|
||||
SPILL r1, 0
|
||||
TRANSPOSE2x4x4W 4,5,6,7,0
|
||||
UNSPILL r1, 0
|
||||
; adds 32 to every word before the second pass -- presumably the rounding
; term for a later shift that is not visible in this file
paddw m0, [pw_32]
|
||||
IDCT4_1D w,0,1,2,3,r1
|
||||
paddw m4, [pw_32]
|
||||
IDCT4_1D w,4,5,6,7,r1
|
||||
; free m6/m7 for use by DIFFx2 (m7 is zeroed just below)
SPILL r1, 6,7
|
||||
pxor m7, m7
|
||||
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
|
||||
DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
|
||||
; reload the spilled m6/m7 values into m0/m2
UNSPILL_SHUFFLE r1, 0,2, 6,7
|
||||
DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
|
||||
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
|
||||
STORE_IDCT m1, m3, m5, m2
|
||||
ret
|
||||
%endmacro ; ADD8x8
|
||||
|
||||
; instantiate the function once per instruction set
INIT_XMM sse2
|
||||
ADD8x8
|
||||
INIT_XMM avx
|
||||
ADD8x8
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
|
||||
;-----------------------------------------------------------------------------
|
||||
; add8x8_idct8 for 8-bit depth on x86-32 (eight vector registers).
; r0 = destination pixels, r1 = dct block (also used as spill space).
; r0 is advanced by 4*FDEC_STRIDE so the eight rows are addressed as
; r0-4*FDEC_STRIDE .. r0+3*FDEC_STRIDE; the .skip_prologue entry point expects
; r0 to be biased already.
%macro ADD8x8_IDCT8 0
|
||||
cglobal add8x8_idct8, 2,2
|
||||
add r0, 4*FDEC_STRIDE
|
||||
cglobal_label .skip_prologue
|
||||
; IDCT8_1D reads inputs 0 and 4 from [r1+0] and [r1+64] itself
UNSPILL r1, 1,2,3,5,6,7
|
||||
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
|
||||
SPILL r1, 6
|
||||
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
|
||||
; adds 32 to every word of row 0 -- presumably the rounding term for a
; later shift that is not visible in this file
paddw m0, [pw_32]
|
||||
; row 0 must be in memory for the second pass
SPILL r1, 0
|
||||
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
|
||||
; free m6/m7 for use by DIFFx2 (m7 is zeroed just below)
SPILL r1, 6,7
|
||||
pxor m7, m7
|
||||
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
|
||||
DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
|
||||
; reload the spilled m6/m7 values into m0/m2
UNSPILL_SHUFFLE r1, 0,2, 6,7
|
||||
DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
|
||||
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
|
||||
STORE_IDCT m1, m3, m5, m2
|
||||
ret
|
||||
%endmacro ; ADD8x8_IDCT8
|
||||
|
||||
; instantiate the function once per instruction set
INIT_XMM sse2
|
||||
ADD8x8_IDCT8
|
||||
INIT_XMM avx
|
||||
ADD8x8_IDCT8
|
||||
%endif ; !HIGH_BIT_DEPTH
|
||||
424
common/x86/dct-64.asm
Normal file
424
common/x86/dct-64.asm
Normal file
@@ -0,0 +1,424 @@
|
||||
;*****************************************************************************
|
||||
;* dct-64.asm: x86_64 transform and zigzag
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2003-2025 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Holger Lubitz <holger@lubitz.org>
|
||||
;* Laurent Aimar <fenrir@via.ecp.fr>
|
||||
;* Min Chen <chenm001.163.com>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
cextern pd_32
|
||||
cextern pw_pixel_max
|
||||
cextern pw_2
|
||||
cextern pw_m2
|
||||
cextern pw_32
|
||||
cextern hsub_mul
|
||||
|
||||
; in: size, m0..m7, temp, temp
|
||||
; out: m0..m7
|
||||
; One 8-point forward transform pass, x86-64 variant: everything stays in
; registers.
; %1       = element-size suffix pasted onto psra/padd/psub (w or d at the
;            call sites in this file)
; %2-%9    = register numbers holding inputs 0..7
; %10, %11 = register numbers of two temporaries
; The closing SWAP renames registers; per the description above the outputs
; end up in m0..m7.
%macro DCT8_1D 11
|
||||
SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
|
||||
SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
|
||||
SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
|
||||
SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
|
||||
|
||||
SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
|
||||
SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
|
||||
|
||||
psra%1 m%10, m%2, 1
|
||||
padd%1 m%10, m%2
|
||||
padd%1 m%10, m%3
|
||||
padd%1 m%10, m%4 ; %10=a4
|
||||
|
||||
psra%1 m%11, m%5, 1
|
||||
padd%1 m%11, m%5
|
||||
padd%1 m%11, m%3
|
||||
psub%1 m%11, m%4 ; %11=a7
|
||||
|
||||
SUMSUB_BA %1, %5, %2
|
||||
psub%1 m%2, m%4
|
||||
psub%1 m%5, m%3
|
||||
psra%1 m%4, 1
|
||||
psra%1 m%3, 1
|
||||
psub%1 m%2, m%4 ; %2=a5
|
||||
psub%1 m%5, m%3 ; %5=a6
|
||||
|
||||
psra%1 m%3, m%11, 2
|
||||
padd%1 m%3, m%10 ; %3=b1
|
||||
psra%1 m%10, 2
|
||||
psub%1 m%10, m%11 ; %10=b7
|
||||
|
||||
SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
|
||||
|
||||
psra%1 m%4, m%8, 1
|
||||
padd%1 m%4, m%9 ; %4=b2
|
||||
psra%1 m%9, 1
|
||||
psub%1 m%9, m%8 ; %9=b6
|
||||
|
||||
psra%1 m%8, m%5, 2
|
||||
padd%1 m%8, m%2 ; %8=b3
|
||||
psra%1 m%2, 2
|
||||
psub%1 m%5, m%2 ; %5=b5
|
||||
|
||||
SWAP %2, %7, %5, %8, %9, %10
|
||||
%endmacro
|
||||
|
||||
; One 8-point inverse transform pass, x86-64 variant: everything stays in
; registers.
; %1       = element-size suffix pasted onto psra/padd/psub (d at the call
;            sites visible in this part of the file)
; %2-%9    = register numbers holding inputs 0..7
; %10, %11 = register numbers of two temporaries
; The trailing comments name the macro parameter written on each line (so
; "%6=a0" refers to register m%6). The closing SWAPs rename registers.
%macro IDCT8_1D 11
|
||||
SUMSUB_BA %1, %6, %2, %10 ; %6=a0, %2=a2
|
||||
|
||||
psra%1 m%10, m%3, 1
|
||||
padd%1 m%10, m%3
|
||||
padd%1 m%10, m%5
|
||||
padd%1 m%10, m%7 ; %10=a7
|
||||
|
||||
psra%1 m%11, m%4, 1
|
||||
psub%1 m%11, m%8 ; %11=a4
|
||||
psra%1 m%8, 1
|
||||
padd%1 m%8, m%4 ; %8=a6
|
||||
|
||||
psra%1 m%4, m%7, 1
|
||||
padd%1 m%4, m%7
|
||||
padd%1 m%4, m%9
|
||||
psub%1 m%4, m%3 ; %4=a5
|
||||
|
||||
psub%1 m%3, m%5
|
||||
psub%1 m%7, m%5
|
||||
padd%1 m%3, m%9
|
||||
psub%1 m%7, m%9
|
||||
psra%1 m%5, 1
|
||||
psra%1 m%9, 1
|
||||
psub%1 m%3, m%5 ; %3=a3
|
||||
psub%1 m%7, m%9 ; %7=a1
|
||||
|
||||
psra%1 m%5, m%10, 2
|
||||
padd%1 m%5, m%7 ; %5=b1
|
||||
psra%1 m%7, 2
|
||||
psub%1 m%10, m%7 ; %10=b7
|
||||
|
||||
SUMSUB_BA %1, %8, %6, %7 ; %8=b0, %6=b6
|
||||
SUMSUB_BA %1, %11, %2, %7 ; %11=b2, %2=b4
|
||||
|
||||
psra%1 m%9, m%4, 2
|
||||
padd%1 m%9, m%3 ; %9=b3
|
||||
psra%1 m%3, 2
|
||||
psub%1 m%3, m%4 ; %3=b5
|
||||
|
||||
SUMSUB_BA %1, %10, %8, %7 ; %10=c0, %8=c7
|
||||
SUMSUB_BA %1, %3, %11, %7 ; %3=c1, %11=c6
|
||||
SUMSUB_BA %1, %9, %2, %7 ; %9=c2, %2=c5
|
||||
SUMSUB_BA %1, %5, %6, %7 ; %5=c3, %6=c4
|
||||
|
||||
SWAP %11, %4
|
||||
SWAP %2, %10, %7
|
||||
SWAP %4, %9, %8
|
||||
%endmacro
|
||||
|
||||
%if HIGH_BIT_DEPTH
|
||||
|
||||
; High-bit-depth sub8x8_dct8 for x86-64 (declares 14 vector registers).
; r0 = coefficient output, r1/r2 = the two pixel sources passed to LOAD_DIFF8x4.
; The first pass runs on words (DCT8_1D w); each 4-row half is then transposed,
; widened to dwords (WIDEN_SXWD) and run through a dword pass (DCT8_1D d).
; No intermediate spilling to memory is needed.
%macro SUB8x8_DCT8 0
|
||||
cglobal sub8x8_dct8, 3,3,14
|
||||
; NOTE(review): with a second argument of 0 this presumably lets execution
; fall through to the adjacent label -- confirm against x86inc's TAIL_CALL
TAIL_CALL .skip_prologue, 0
|
||||
cglobal_label .skip_prologue
|
||||
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
|
||||
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
|
||||
|
||||
DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
|
||||
|
||||
; first half: rows 0..3 are widened into register pairs (0,8) (1,9) (2,10) (3,11)
TRANSPOSE4x4W 0,1,2,3,8
|
||||
WIDEN_SXWD 0,8
|
||||
WIDEN_SXWD 1,9
|
||||
WIDEN_SXWD 2,10
|
||||
WIDEN_SXWD 3,11
|
||||
DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
|
||||
; results go to the even 16-byte slots of the output
mova [r0+0x00], m0
|
||||
mova [r0+0x20], m8
|
||||
mova [r0+0x40], m1
|
||||
mova [r0+0x60], m9
|
||||
mova [r0+0x80], m2
|
||||
mova [r0+0xA0], m10
|
||||
mova [r0+0xC0], m3
|
||||
mova [r0+0xE0], m11
|
||||
|
||||
; second half: rows 4..7, reusing m0..m3 for the widened halves
TRANSPOSE4x4W 4,5,6,7,0
|
||||
WIDEN_SXWD 4,0
|
||||
WIDEN_SXWD 5,1
|
||||
WIDEN_SXWD 6,2
|
||||
WIDEN_SXWD 7,3
|
||||
DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
|
||||
; results go to the odd 16-byte slots of the output
mova [r0+0x10], m4
|
||||
mova [r0+0x30], m0
|
||||
mova [r0+0x50], m5
|
||||
mova [r0+0x70], m1
|
||||
mova [r0+0x90], m6
|
||||
mova [r0+0xB0], m2
|
||||
mova [r0+0xD0], m7
|
||||
mova [r0+0xF0], m3
|
||||
ret
|
||||
%endmacro ; SUB8x8_DCT8
|
||||
|
||||
; instantiate the function once per instruction set
INIT_XMM sse2
|
||||
SUB8x8_DCT8
|
||||
INIT_XMM sse4
|
||||
SUB8x8_DCT8
|
||||
INIT_XMM avx
|
||||
SUB8x8_DCT8
|
||||
|
||||
; High-bit-depth add8x8_idct8 for x86-64 (declares all 16 vector registers).
; r0 = destination pixels, r1 = coefficient block; a few slots of the block
; are reused as spill space.
; r1 is advanced by 128 up front, so the block is addressed as r1-128..r1+112;
; the .skip_prologue entry point therefore expects r1 to be biased already.
%macro ADD8x8_IDCT8 0
|
||||
cglobal add8x8_idct8, 2,2,16
|
||||
add r1, 128
|
||||
; NOTE(review): with a second argument of 0 this presumably lets execution
; fall through to the adjacent label -- confirm against x86inc's TAIL_CALL
TAIL_CALL .skip_prologue, 0
|
||||
cglobal_label .skip_prologue
|
||||
; even 16-byte slots of the block -> m0..m7
mova m0, [r1-128]
|
||||
mova m1, [r1-96]
|
||||
mova m2, [r1-64]
|
||||
mova m3, [r1-32]
|
||||
mova m4, [r1+ 0]
|
||||
mova m5, [r1+32]
|
||||
mova m6, [r1+64]
|
||||
mova m7, [r1+96]
|
||||
IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
|
||||
TRANSPOSE4x4D 0,1,2,3,8
|
||||
TRANSPOSE4x4D 4,5,6,7,8
|
||||
; adds 32 to every lane -- presumably the rounding term for the final
; downshift, which is not visible in this file (TODO confirm in STORE_DIFF)
paddd m0, [pd_32]
|
||||
paddd m4, [pd_32]
|
||||
; m6/m7 are parked in memory: they serve as the temporaries of the next two
; IDCT8_1D calls and are reloaded before the last one
mova [r1+64], m6
|
||||
mova [r1+96], m7
|
||||
; odd 16-byte slots of the block -> m8..m15
mova m8, [r1-112]
|
||||
mova m9, [r1-80]
|
||||
mova m10, [r1-48]
|
||||
mova m11, [r1-16]
|
||||
mova m12, [r1+16]
|
||||
mova m13, [r1+48]
|
||||
mova m14, [r1+80]
|
||||
mova m15, [r1+112]
|
||||
IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
|
||||
TRANSPOSE4x4D 8,9,10,11,6
|
||||
TRANSPOSE4x4D 12,13,14,15,6
|
||||
IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
|
||||
; m8/m9 are parked: they are the temporaries of the last IDCT8_1D and then
; hold the zero / pixel-max operands of STORE_DIFF
mova [r1-112], m8
|
||||
mova [r1-80], m9
|
||||
mova m6, [r1+64]
|
||||
mova m7, [r1+96]
|
||||
IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
|
||||
pxor m8, m8
|
||||
mova m9, [pw_pixel_max]
|
||||
STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
|
||||
STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
|
||||
STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
|
||||
STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
|
||||
; reload the parked m8/m9 values into m0/m1 for rows 4 and 5
mova m0, [r1-112]
|
||||
mova m1, [r1-80]
|
||||
STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
|
||||
STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
|
||||
STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
|
||||
STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
|
||||
ret
|
||||
%endmacro ; ADD8x8_IDCT8
|
||||
|
||||
; instantiate the function once per instruction set
INIT_XMM sse2
|
||||
ADD8x8_IDCT8
|
||||
INIT_XMM avx
|
||||
ADD8x8_IDCT8
|
||||
|
||||
%else ; !HIGH_BIT_DEPTH
|
||||
|
||||
;-----------------------------------------------------------------------------
; DCT_SUB8: emits two 8-bit-depth functions for the current INIT_XMM target,
; sub8x8_dct (4x4 transforms) and sub8x8_dct8 (8x8 transform).
;
; Both take r0 = dct (output), r1 = pix1, r2 = pix2.
; Both prologues advance r2 by 4*FDEC_STRIDE and, on ssse3 and later, load the
; hsub_mul constant into m7. Code entering at .skip_prologue bypasses that
; setup and must therefore provide the biased r2 (and m7 where applicable)
; itself.
;-----------------------------------------------------------------------------
%macro DCT_SUB8 0
|
||||
; void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
; x86inc prologue: 3 arguments, 3 general-purpose registers, 10 vector registers.
cglobal sub8x8_dct, 3,3,10
|
||||
; Bias r2; LOAD_DIFF8x4 below is handed r2-4*FDEC_STRIDE to compensate.
add r2, 4*FDEC_STRIDE
|
||||
%if cpuflag(ssse3)
|
||||
mova m7, [hsub_mul]
|
||||
%endif
|
||||
TAIL_CALL .skip_prologue, 0
|
||||
cglobal_label .skip_prologue
|
||||
; Assemble-time register rename (no instruction emitted): the value loaded
; into m7 by the prologue is addressed as m9 from here on, which frees the
; name m7 for a data row.
SWAP 7, 9
|
||||
; Load the eight difference rows into m0-m7, four per invocation; m8 and m9
; fill the macro's two scratch/constant slots (m9 holds hsub_mul on ssse3+).
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
|
||||
LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
|
||||
; First 1-D pass and transpose for each group of four registers, m8 scratch.
DCT4_1D 0, 1, 2, 3, 8
|
||||
TRANSPOSE2x4x4W 0, 1, 2, 3, 8
|
||||
DCT4_1D 4, 5, 6, 7, 8
|
||||
TRANSPOSE2x4x4W 4, 5, 6, 7, 8
|
||||
; Second 1-D pass; m0-m3 are stored at byte offset 0 of dct and m4-m7 at
; byte offset 64.
DCT4_1D 0, 1, 2, 3, 8
|
||||
STORE_DCT 0, 1, 2, 3, r0, 0
|
||||
DCT4_1D 4, 5, 6, 7, 8
|
||||
STORE_DCT 4, 5, 6, 7, r0, 64
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
|
||||
;-----------------------------------------------------------------------------
|
||||
; x86inc prologue: 3 arguments, 3 general-purpose registers, 11 vector registers.
cglobal sub8x8_dct8, 3,3,11
|
||||
; Same prologue contract as sub8x8_dct above.
add r2, 4*FDEC_STRIDE
|
||||
%if cpuflag(ssse3)
|
||||
mova m7, [hsub_mul]
|
||||
%endif
|
||||
TAIL_CALL .skip_prologue, 0
|
||||
cglobal_label .skip_prologue
|
||||
; Rename the prologue's m7 to m10; m8/m9 stay free as DCT8_1D scratch.
SWAP 7, 10
|
||||
; The first load uses m4 as its scratch slot (m4 is only filled with data by
; the second load); the second load uses m8.
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
|
||||
LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
|
||||
; 1-D pass on word lanes, 8x8 word transpose, second 1-D pass.
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
|
||||
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
|
||||
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
|
||||
; Store eight 16-byte rows contiguously (128 bytes in total).
movdqa [r0+0x00], m0
|
||||
movdqa [r0+0x10], m1
|
||||
movdqa [r0+0x20], m2
|
||||
movdqa [r0+0x30], m3
|
||||
movdqa [r0+0x40], m4
|
||||
movdqa [r0+0x50], m5
|
||||
movdqa [r0+0x60], m6
|
||||
movdqa [r0+0x70], m7
|
||||
ret
|
||||
%endmacro
|
||||
|
||||
; sse2 instantiation: movdqa and punpcklqdq are aliased to movaps and movlhps
; for the duration of this expansion only (presumably for their shorter
; legacy-SSE encodings -- TODO confirm), then restored.
INIT_XMM sse2
|
||||
%define movdqa movaps
|
||||
%define punpcklqdq movlhps
|
||||
DCT_SUB8
|
||||
%undef movdqa
|
||||
%undef punpcklqdq
|
||||
; Remaining targets use the instructions as written.
INIT_XMM ssse3
|
||||
DCT_SUB8
|
||||
INIT_XMM avx
|
||||
DCT_SUB8
|
||||
INIT_XMM xop
|
||||
DCT_SUB8
|
||||
|
||||
;-----------------------------------------------------------------------------
; void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
;
; AVX2 (ymm) version for 8-bit depth. The 16x16 area is handled as two calls
; to the local routine .sub16x8_dct8; each call writes two 128-byte
; coefficient blocks, one from each 128-bit lane.
; In:    r0 = dct (output), r1 = pix1, r2 = pix2
; Uses:  m0-m9; r0, r1, r2 are modified
;-----------------------------------------------------------------------------
INIT_YMM avx2
|
||||
; x86inc prologue: 3 arguments, 3 general-purpose registers, 10 vector registers.
cglobal sub16x16_dct8, 3,3,10
|
||||
; Bias r0 by one 128-byte block: the routine addresses the two blocks it
; writes as [r0-0x80+...] and [r0+...].
add r0, 128
|
||||
; Bias r2 by four rows (presumably matching what LOAD_DIFF16x2_AVX2 expects;
; that macro is defined elsewhere -- TODO confirm).
add r2, 4*FDEC_STRIDE
|
||||
; Upper 16x8 half -> first two output blocks.
call .sub16x8_dct8
|
||||
; Advance to the next pair of output blocks and eight rows down in both
; inputs.
add r0, 256
|
||||
add r1, FENC_STRIDE*8
|
||||
add r2, FDEC_STRIDE*8
|
||||
; Lower 16x8 half -> last two output blocks.
call .sub16x8_dct8
|
||||
RET
|
||||
; Local routine: transforms one 16x8 strip.
; In: r0 = address of the SECOND of the two output blocks, r1/r2 = inputs.
.sub16x8_dct8:
|
||||
; Load the eight difference rows into m0-m7, two rows per invocation. Each
; invocation also names the following two registers (e.g. m2,m3 for the first)
; before they are filled with data -- presumably as scratch.
LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
|
||||
LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
|
||||
LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
|
||||
LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
|
||||
; 1-D pass on word lanes, 8x8 word transpose, second 1-D pass; m8/m9 scratch.
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
|
||||
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
|
||||
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
|
||||
; For each result register: the low 128-bit lane goes to the block at
; r0-0x80, the high lane (vextracti128 ..., 1) to the block at r0.
mova [r0-0x80+0x00], xm0
|
||||
vextracti128 [r0+0x00], m0, 1
|
||||
mova [r0-0x80+0x10], xm1
|
||||
vextracti128 [r0+0x10], m1, 1
|
||||
mova [r0-0x80+0x20], xm2
|
||||
vextracti128 [r0+0x20], m2, 1
|
||||
mova [r0-0x80+0x30], xm3
|
||||
vextracti128 [r0+0x30], m3, 1
|
||||
mova [r0-0x80+0x40], xm4
|
||||
vextracti128 [r0+0x40], m4, 1
|
||||
mova [r0-0x80+0x50], xm5
|
||||
vextracti128 [r0+0x50], m5, 1
|
||||
mova [r0-0x80+0x60], xm6
|
||||
vextracti128 [r0+0x60], m6, 1
|
||||
mova [r0-0x80+0x70], xm7
|
||||
vextracti128 [r0+0x70], m7, 1
|
||||
ret
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;
; 8-bit-depth 8x8 inverse transform; the result is combined into p_dst by
; DIFFx2 / STORE_IDCT.
; In:    r0 = p_dst, r1 = dct
; Uses:  m0-m10
; The prologue advances r0 by 4*FDEC_STRIDE and zeroes m7; code entering at
; .skip_prologue must supply both itself.
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro ADD8x8_IDCT8 0
|
||||
; x86inc prologue: 2 arguments, 2 general-purpose registers, 11 vector registers.
cglobal add8x8_idct8, 2,2,11
|
||||
; Point r0 at row 4 so the eight rows are addressed as -4..+3 rows below.
add r0, 4*FDEC_STRIDE
|
||||
pxor m7, m7
|
||||
TAIL_CALL .skip_prologue, 0
|
||||
cglobal_label .skip_prologue
|
||||
; Assemble-time rename: the zero register created above becomes m9, freeing
; the name m7 for the eighth coefficient row.
SWAP 7, 9
|
||||
; Load the eight 16-byte coefficient rows.
movdqa m0, [r1+0x00]
|
||||
movdqa m1, [r1+0x10]
|
||||
movdqa m2, [r1+0x20]
|
||||
movdqa m3, [r1+0x30]
|
||||
movdqa m4, [r1+0x40]
|
||||
movdqa m5, [r1+0x50]
|
||||
movdqa m6, [r1+0x60]
|
||||
movdqa m7, [r1+0x70]
|
||||
; First 1-D pass; scratch is m8 and m10, so the zero in m9 survives.
IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
|
||||
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
|
||||
paddw m0, [pw_32] ; rounding for the >>6 at the end
|
||||
; Second 1-D pass.
IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
|
||||
; Combine each pair of result rows with the two destination rows given;
; m8 is scratch and m9 is the zero register. NOTE(review): DIFFx2 and
; STORE_IDCT are defined elsewhere; the final results appear to be left in
; m1/m3/m5/m7 for STORE_IDCT to write out -- confirm there.
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
|
||||
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
|
||||
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
|
||||
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
|
||||
STORE_IDCT m1, m3, m5, m7
|
||||
ret
|
||||
%endmacro ; ADD8x8_IDCT8
|
||||
|
||||
; Emit one copy of the function per instruction-set target.
INIT_XMM sse2
|
||||
ADD8x8_IDCT8
|
||||
INIT_XMM avx
|
||||
ADD8x8_IDCT8
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;
; 8-bit-depth inverse transform of four 4x4 coefficient blocks (128 bytes at
; r1) covering an 8x8 pixel area; the result is combined into pix by
; DIFFx2 / STORE_IDCT.
; In:    r0 = pix, r1 = dct
; Uses:  m0-m10
; The prologue advances r0 by 4*FDEC_STRIDE and zeroes m7; code entering at
; .skip_prologue must supply both itself.
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro ADD8x8 0
|
||||
; x86inc prologue: 2 arguments, 2 general-purpose registers, 11 vector registers.
cglobal add8x8_idct, 2,2,11
|
||||
; Point r0 at row 4 so the eight rows are addressed as -4..+3 rows below.
add r0, 4*FDEC_STRIDE
|
||||
pxor m7, m7
|
||||
TAIL_CALL .skip_prologue, 0
|
||||
cglobal_label .skip_prologue
|
||||
; Assemble-time rename: the zero register created above becomes m9.
SWAP 7, 9
|
||||
; Load the first 64 bytes. Note the register order 0,2,1,3: bytes 0-31 go to
; m0/m2 and bytes 32-63 to m1/m3, so that the quadword interleaves below
; pair data from the two 32-byte halves (presumably putting the same row of
; two adjacent 4x4 blocks side by side -- confirm against SBUTTERFLY).
mova m0, [r1+ 0]
|
||||
mova m2, [r1+16]
|
||||
mova m1, [r1+32]
|
||||
mova m3, [r1+48]
|
||||
SBUTTERFLY qdq, 0, 1, 4
|
||||
SBUTTERFLY qdq, 2, 3, 4
|
||||
; Same arrangement for the second 64 bytes, into m4-m7 (m8 as temporary).
mova m4, [r1+64]
|
||||
mova m6, [r1+80]
|
||||
mova m5, [r1+96]
|
||||
mova m7, [r1+112]
|
||||
SBUTTERFLY qdq, 4, 5, 8
|
||||
SBUTTERFLY qdq, 6, 7, 8
|
||||
; First 1-D pass and transpose for each group; scratch is m8 and m10, so the
; zero in m9 survives.
IDCT4_1D w,0,1,2,3,8,10
|
||||
TRANSPOSE2x4x4W 0,1,2,3,8
|
||||
IDCT4_1D w,4,5,6,7,8,10
|
||||
TRANSPOSE2x4x4W 4,5,6,7,8
|
||||
; Add the constant 32 to the first register of each group before the second
; pass (presumably the rounding term for a final >>6 -- TODO confirm).
paddw m0, [pw_32]
|
||||
IDCT4_1D w,0,1,2,3,8,10
|
||||
paddw m4, [pw_32]
|
||||
IDCT4_1D w,4,5,6,7,8,10
|
||||
; Combine each pair of result rows with the two destination rows given;
; m8 is scratch and m9 is the zero register. NOTE(review): DIFFx2 and
; STORE_IDCT are defined elsewhere; the final results appear to be left in
; m1/m3/m5/m7 for STORE_IDCT to write out -- confirm there.
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
|
||||
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
|
||||
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
|
||||
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
|
||||
STORE_IDCT m1, m3, m5, m7
|
||||
ret
|
||||
%endmacro ; ADD8x8
|
||||
|
||||
; Emit one copy of the function per instruction-set target.
INIT_XMM sse2
|
||||
ADD8x8
|
||||
INIT_XMM avx
|
||||
ADD8x8
|
||||
|
||||
%endif ; !HIGH_BIT_DEPTH
|
||||
2287
common/x86/dct-a.asm
Normal file
2287
common/x86/dct-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
249
common/x86/dct.h
Normal file
249
common/x86/dct.h
Normal file
@@ -0,0 +1,249 @@
|
||||
/*****************************************************************************
|
||||
* dct.h: x86 transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2025 x264 project
|
||||
*
|
||||
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||
* Fiona Glaser <fiona@x264.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_DCT_H
|
||||
#define X264_X86_DCT_H
|
||||
|
||||
#define x264_sub4x4_dct_mmx x264_template(sub4x4_dct_mmx)
|
||||
void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub8x8_dct_mmx x264_template(sub8x8_dct_mmx)
|
||||
void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub16x16_dct_mmx x264_template(sub16x16_dct_mmx)
|
||||
void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub8x8_dct_sse2 x264_template(sub8x8_dct_sse2)
|
||||
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_sse2 x264_template(sub16x16_dct_sse2)
|
||||
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub4x4_dct_ssse3 x264_template(sub4x4_dct_ssse3)
|
||||
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub4x4_dct_avx512 x264_template(sub4x4_dct_avx512)
|
||||
void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_ssse3 x264_template(sub8x8_dct_ssse3)
|
||||
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_ssse3 x264_template(sub16x16_dct_ssse3)
|
||||
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_avx x264_template(sub8x8_dct_avx)
|
||||
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_avx x264_template(sub16x16_dct_avx)
|
||||
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_xop x264_template(sub8x8_dct_xop)
|
||||
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_xop x264_template(sub16x16_dct_xop)
|
||||
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_avx2 x264_template(sub8x8_dct_avx2)
|
||||
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_avx512 x264_template(sub8x8_dct_avx512)
|
||||
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_avx2 x264_template(sub16x16_dct_avx2)
|
||||
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct_avx512 x264_template(sub16x16_dct_avx512)
|
||||
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_dc_mmx2 x264_template(sub8x8_dct_dc_mmx2)
|
||||
void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct_dc_sse2 x264_template(sub8x8_dct_dc_sse2)
|
||||
void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub8x8_dct_dc_avx512 x264_template(sub8x8_dct_dc_avx512)
|
||||
void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x16_dct_dc_sse2 x264_template(sub8x16_dct_dc_sse2)
|
||||
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub8x16_dct_dc_ssse3 x264_template(sub8x16_dct_dc_ssse3)
|
||||
void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x16_dct_dc_avx x264_template(sub8x16_dct_dc_avx)
|
||||
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub8x16_dct_dc_avx512 x264_template(sub8x16_dct_dc_avx512)
|
||||
void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
|
||||
|
||||
#define x264_add4x4_idct_mmx x264_template(add4x4_idct_mmx)
|
||||
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
|
||||
#define x264_add4x4_idct_sse2 x264_template(add4x4_idct_sse2)
|
||||
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
|
||||
#define x264_add4x4_idct_sse4 x264_template(add4x4_idct_sse4)
|
||||
void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
|
||||
#define x264_add4x4_idct_avx x264_template(add4x4_idct_avx)
|
||||
void x264_add4x4_idct_avx ( pixel *p_dst, dctcoef dct [16] );
|
||||
#define x264_add8x8_idct_mmx x264_template(add8x8_idct_mmx)
|
||||
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
|
||||
#define x264_add8x8_idct_dc_mmx2 x264_template(add8x8_idct_dc_mmx2)
|
||||
void x264_add8x8_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [ 4] );
|
||||
#define x264_add16x16_idct_mmx x264_template(add16x16_idct_mmx)
|
||||
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
|
||||
#define x264_add16x16_idct_dc_mmx2 x264_template(add16x16_idct_dc_mmx2)
|
||||
void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
|
||||
#define x264_add8x8_idct_sse2 x264_template(add8x8_idct_sse2)
|
||||
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
|
||||
#define x264_add8x8_idct_avx x264_template(add8x8_idct_avx)
|
||||
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
|
||||
#define x264_add8x8_idct_avx2 x264_template(add8x8_idct_avx2)
|
||||
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
|
||||
#define x264_add8x8_idct_avx512 x264_template(add8x8_idct_avx512)
|
||||
void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] );
|
||||
#define x264_add16x16_idct_sse2 x264_template(add16x16_idct_sse2)
|
||||
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
|
||||
#define x264_add16x16_idct_avx x264_template(add16x16_idct_avx)
|
||||
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
|
||||
#define x264_add16x16_idct_avx2 x264_template(add16x16_idct_avx2)
|
||||
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
|
||||
#define x264_add8x8_idct_dc_sse2 x264_template(add8x8_idct_dc_sse2)
|
||||
void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] );
|
||||
#define x264_add16x16_idct_dc_sse2 x264_template(add16x16_idct_dc_sse2)
|
||||
void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] );
|
||||
#define x264_add8x8_idct_dc_ssse3 x264_template(add8x8_idct_dc_ssse3)
|
||||
void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
|
||||
#define x264_add16x16_idct_dc_ssse3 x264_template(add16x16_idct_dc_ssse3)
|
||||
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
|
||||
#define x264_add8x8_idct_dc_avx x264_template(add8x8_idct_dc_avx)
|
||||
void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] );
|
||||
#define x264_add16x16_idct_dc_avx x264_template(add16x16_idct_dc_avx)
|
||||
void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] );
|
||||
#define x264_add16x16_idct_dc_avx2 x264_template(add16x16_idct_dc_avx2)
|
||||
void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] );
|
||||
|
||||
#define x264_dct4x4dc_mmx2 x264_template(dct4x4dc_mmx2)
|
||||
void x264_dct4x4dc_mmx2 ( int16_t d[16] );
|
||||
#define x264_dct4x4dc_sse2 x264_template(dct4x4dc_sse2)
|
||||
void x264_dct4x4dc_sse2 ( int32_t d[16] );
|
||||
#define x264_dct4x4dc_avx x264_template(dct4x4dc_avx)
|
||||
void x264_dct4x4dc_avx ( int32_t d[16] );
|
||||
#define x264_idct4x4dc_mmx x264_template(idct4x4dc_mmx)
|
||||
void x264_idct4x4dc_mmx ( int16_t d[16] );
|
||||
#define x264_idct4x4dc_sse2 x264_template(idct4x4dc_sse2)
|
||||
void x264_idct4x4dc_sse2 ( int32_t d[16] );
|
||||
#define x264_idct4x4dc_avx x264_template(idct4x4dc_avx)
|
||||
void x264_idct4x4dc_avx ( int32_t d[16] );
|
||||
|
||||
#define x264_dct2x4dc_mmx2 x264_template(dct2x4dc_mmx2)
|
||||
void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] );
|
||||
#define x264_dct2x4dc_sse2 x264_template(dct2x4dc_sse2)
|
||||
void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] );
|
||||
#define x264_dct2x4dc_avx x264_template(dct2x4dc_avx)
|
||||
void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] );
|
||||
|
||||
#define x264_sub8x8_dct8_mmx x264_template(sub8x8_dct8_mmx)
|
||||
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct8_mmx x264_template(sub16x16_dct8_mmx)
|
||||
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct8_sse2 x264_template(sub8x8_dct8_sse2)
|
||||
void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub16x16_dct8_sse2 x264_template(sub16x16_dct8_sse2)
|
||||
void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub8x8_dct8_ssse3 x264_template(sub8x8_dct8_ssse3)
|
||||
void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub16x16_dct8_ssse3 x264_template(sub16x16_dct8_ssse3)
|
||||
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
|
||||
#define x264_sub8x8_dct8_sse4 x264_template(sub8x8_dct8_sse4)
|
||||
void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 );
|
||||
#define x264_sub16x16_dct8_sse4 x264_template(sub16x16_dct8_sse4)
|
||||
void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
|
||||
#define x264_sub8x8_dct8_avx x264_template(sub8x8_dct8_avx)
|
||||
void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub16x16_dct8_avx x264_template(sub16x16_dct8_avx)
|
||||
void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
|
||||
#define x264_sub16x16_dct8_avx2 x264_template(sub16x16_dct8_avx2)
|
||||
void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
|
||||
|
||||
|
||||
#define x264_add8x8_idct8_mmx x264_template(add8x8_idct8_mmx)
|
||||
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
|
||||
#define x264_add16x16_idct8_mmx x264_template(add16x16_idct8_mmx)
|
||||
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
|
||||
#define x264_add8x8_idct8_sse2 x264_template(add8x8_idct8_sse2)
|
||||
void x264_add8x8_idct8_sse2 ( pixel *dst, dctcoef dct [64] );
|
||||
#define x264_add16x16_idct8_sse2 x264_template(add16x16_idct8_sse2)
|
||||
void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
|
||||
#define x264_add8x8_idct8_avx x264_template(add8x8_idct8_avx)
|
||||
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
|
||||
#define x264_add16x16_idct8_avx x264_template(add16x16_idct8_avx)
|
||||
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
|
||||
|
||||
#define x264_zigzag_scan_8x8_frame_mmx2 x264_template(zigzag_scan_8x8_frame_mmx2)
|
||||
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_frame_sse2 x264_template(zigzag_scan_8x8_frame_sse2)
|
||||
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
|
||||
#define x264_zigzag_scan_8x8_frame_ssse3 x264_template(zigzag_scan_8x8_frame_ssse3)
|
||||
void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_frame_avx x264_template(zigzag_scan_8x8_frame_avx)
|
||||
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
|
||||
#define x264_zigzag_scan_8x8_frame_xop x264_template(zigzag_scan_8x8_frame_xop)
|
||||
void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_frame_avx512 x264_template(zigzag_scan_8x8_frame_avx512)
|
||||
void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
|
||||
#define x264_zigzag_scan_4x4_frame_mmx x264_template(zigzag_scan_4x4_frame_mmx)
|
||||
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
|
||||
#define x264_zigzag_scan_4x4_frame_sse2 x264_template(zigzag_scan_4x4_frame_sse2)
|
||||
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
|
||||
#define x264_zigzag_scan_4x4_frame_ssse3 x264_template(zigzag_scan_4x4_frame_ssse3)
|
||||
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
|
||||
#define x264_zigzag_scan_4x4_frame_avx x264_template(zigzag_scan_4x4_frame_avx)
|
||||
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
|
||||
#define x264_zigzag_scan_4x4_frame_xop x264_template(zigzag_scan_4x4_frame_xop)
|
||||
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
|
||||
#define x264_zigzag_scan_4x4_frame_avx512 x264_template(zigzag_scan_4x4_frame_avx512)
|
||||
void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] );
|
||||
#define x264_zigzag_scan_4x4_field_sse x264_template(zigzag_scan_4x4_field_sse)
|
||||
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
|
||||
#define x264_zigzag_scan_4x4_field_sse2 x264_template(zigzag_scan_4x4_field_sse2)
|
||||
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
|
||||
#define x264_zigzag_scan_4x4_field_avx512 x264_template(zigzag_scan_4x4_field_avx512)
|
||||
void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] );
|
||||
#define x264_zigzag_scan_8x8_field_mmx2 x264_template(zigzag_scan_8x8_field_mmx2)
|
||||
void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_field_sse4 x264_template(zigzag_scan_8x8_field_sse4)
|
||||
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_field_avx x264_template(zigzag_scan_8x8_field_avx)
|
||||
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_field_xop x264_template(zigzag_scan_8x8_field_xop)
|
||||
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
|
||||
#define x264_zigzag_scan_8x8_field_avx512 x264_template(zigzag_scan_8x8_field_avx512)
|
||||
void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] );
|
||||
#define x264_zigzag_sub_4x4_frame_avx x264_template(zigzag_sub_4x4_frame_avx)
|
||||
int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
|
||||
#define x264_zigzag_sub_4x4_frame_ssse3 x264_template(zigzag_sub_4x4_frame_ssse3)
|
||||
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
|
||||
#define x264_zigzag_sub_4x4ac_frame_avx x264_template(zigzag_sub_4x4ac_frame_avx)
|
||||
int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
|
||||
#define x264_zigzag_sub_4x4ac_frame_ssse3 x264_template(zigzag_sub_4x4ac_frame_ssse3)
|
||||
int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
|
||||
#define x264_zigzag_sub_4x4_field_avx x264_template(zigzag_sub_4x4_field_avx)
|
||||
int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
|
||||
#define x264_zigzag_sub_4x4_field_ssse3 x264_template(zigzag_sub_4x4_field_ssse3)
|
||||
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
|
||||
#define x264_zigzag_sub_4x4ac_field_avx x264_template(zigzag_sub_4x4ac_field_avx)
|
||||
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
|
||||
#define x264_zigzag_sub_4x4ac_field_ssse3 x264_template(zigzag_sub_4x4ac_field_ssse3)
|
||||
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
|
||||
#define x264_zigzag_interleave_8x8_cavlc_mmx x264_template(zigzag_interleave_8x8_cavlc_mmx)
|
||||
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
|
||||
#define x264_zigzag_interleave_8x8_cavlc_sse2 x264_template(zigzag_interleave_8x8_cavlc_sse2)
|
||||
void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||
#define x264_zigzag_interleave_8x8_cavlc_avx x264_template(zigzag_interleave_8x8_cavlc_avx)
|
||||
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||
#define x264_zigzag_interleave_8x8_cavlc_avx2 x264_template(zigzag_interleave_8x8_cavlc_avx2)
|
||||
void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz );
|
||||
#define x264_zigzag_interleave_8x8_cavlc_avx512 x264_template(zigzag_interleave_8x8_cavlc_avx512)
|
||||
void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz );
|
||||
|
||||
#endif
|
||||
2548
common/x86/deblock-a.asm
Normal file
2548
common/x86/deblock-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
146
common/x86/deblock.h
Normal file
146
common/x86/deblock.h
Normal file
@@ -0,0 +1,146 @@
|
||||
/*****************************************************************************
|
||||
* deblock.h: x86 deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_DEBLOCK_H
|
||||
#define X264_X86_DEBLOCK_H
|
||||
|
||||
#define x264_deblock_v_luma_sse2 x264_template(deblock_v_luma_sse2)
|
||||
void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_luma_avx x264_template(deblock_v_luma_avx)
|
||||
void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_luma_sse2 x264_template(deblock_h_luma_sse2)
|
||||
void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_luma_avx x264_template(deblock_h_luma_avx)
|
||||
void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_chroma_sse2 x264_template(deblock_v_chroma_sse2)
|
||||
void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_chroma_avx x264_template(deblock_v_chroma_avx)
|
||||
void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_sse2 x264_template(deblock_h_chroma_sse2)
|
||||
void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_avx x264_template(deblock_h_chroma_avx)
|
||||
void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_mbaff_sse2 x264_template(deblock_h_chroma_mbaff_sse2)
|
||||
void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_mbaff_avx x264_template(deblock_h_chroma_mbaff_avx)
|
||||
void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_422_mmx2 x264_template(deblock_h_chroma_422_mmx2)
|
||||
void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_422_sse2 x264_template(deblock_h_chroma_422_sse2)
|
||||
void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_422_avx x264_template(deblock_h_chroma_422_avx)
|
||||
void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_luma_intra_sse2 x264_template(deblock_v_luma_intra_sse2)
|
||||
void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_luma_intra_avx x264_template(deblock_v_luma_intra_avx)
|
||||
void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_luma_intra_sse2 x264_template(deblock_h_luma_intra_sse2)
|
||||
void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_luma_intra_avx x264_template(deblock_h_luma_intra_avx)
|
||||
void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_sse2 x264_template(deblock_v_chroma_intra_sse2)
|
||||
void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_avx x264_template(deblock_v_chroma_intra_avx)
|
||||
void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_sse2 x264_template(deblock_h_chroma_intra_sse2)
|
||||
void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_avx x264_template(deblock_h_chroma_intra_avx)
|
||||
void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_422_intra_mmx2 x264_template(deblock_h_chroma_422_intra_mmx2)
|
||||
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_422_intra_sse2 x264_template(deblock_h_chroma_422_intra_sse2)
|
||||
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_422_intra_avx x264_template(deblock_h_chroma_422_intra_avx)
|
||||
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_strength_sse2 x264_template(deblock_strength_sse2)
|
||||
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
#define x264_deblock_strength_ssse3 x264_template(deblock_strength_ssse3)
|
||||
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
#define x264_deblock_strength_avx x264_template(deblock_strength_avx)
|
||||
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
#define x264_deblock_strength_avx2 x264_template(deblock_strength_avx2)
|
||||
void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
#define x264_deblock_strength_avx512 x264_template(deblock_strength_avx512)
|
||||
void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
|
||||
int mvy_limit, int bframe );
|
||||
|
||||
#define x264_deblock_h_chroma_intra_mbaff_mmx2 x264_template(deblock_h_chroma_intra_mbaff_mmx2)
|
||||
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_mbaff_sse2 x264_template(deblock_h_chroma_intra_mbaff_sse2)
|
||||
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_mbaff_avx x264_template(deblock_h_chroma_intra_mbaff_avx)
|
||||
void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#if ARCH_X86
|
||||
#define x264_deblock_h_luma_mmx2 x264_template(deblock_h_luma_mmx2)
|
||||
void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v8_luma_mmx2 x264_template(deblock_v8_luma_mmx2)
|
||||
void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_chroma_mmx2 x264_template(deblock_v_chroma_mmx2)
|
||||
void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_mmx2 x264_template(deblock_h_chroma_mmx2)
|
||||
void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_mbaff_mmx2 x264_template(deblock_h_chroma_mbaff_mmx2)
|
||||
void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_luma_intra_mmx2 x264_template(deblock_h_luma_intra_mmx2)
|
||||
void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v8_luma_intra_mmx2 x264_template(deblock_v8_luma_intra_mmx2)
|
||||
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_mmx2 x264_template(deblock_v_chroma_intra_mmx2)
|
||||
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_mmx2 x264_template(deblock_h_chroma_intra_mmx2)
|
||||
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_mbaff_mmx2 x264_template(deblock_v_chroma_intra_mbaff_mmx2)
|
||||
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
|
||||
#define x264_deblock_v_luma_mmx2 x264_template(deblock_v_luma_mmx2)
|
||||
#define x264_deblock_v_luma_intra_mmx2 x264_template(deblock_v_luma_intra_mmx2)
|
||||
#if HIGH_BIT_DEPTH
|
||||
void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
|
||||
#else
|
||||
// FIXME this wrapper has a significant cpu cost
|
||||
/* Runs the v8 MMX2 luma kernel twice: first at pix with tc0, then at pix+8 with
 * tc0+2. All other arguments are forwarded unchanged to both calls. */
static ALWAYS_INLINE void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    for( int half = 0; half < 2; half++ )
        x264_deblock_v8_luma_mmx2( pix + 8*half, stride, alpha, beta, tc0 + 2*half );
}
|
||||
/* Runs the v8 MMX2 intra luma kernel twice: first at pix, then at pix+8.
 * stride, alpha and beta are forwarded unchanged to both calls. */
static ALWAYS_INLINE void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta )
{
    for( int half = 0; half < 2; half++ )
        x264_deblock_v8_luma_intra_mmx2( pix + 8*half, stride, alpha, beta );
}
|
||||
#endif // HIGH_BIT_DEPTH
|
||||
#endif
|
||||
|
||||
#endif
|
||||
2226
common/x86/mc-a.asm
Normal file
2226
common/x86/mc-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
2883
common/x86/mc-a2.asm
Normal file
2883
common/x86/mc-a2.asm
Normal file
File diff suppressed because it is too large
Load Diff
1143
common/x86/mc-c.c
Normal file
1143
common/x86/mc-c.c
Normal file
File diff suppressed because it is too large
Load Diff
33
common/x86/mc.h
Normal file
33
common/x86/mc.h
Normal file
@@ -0,0 +1,33 @@
|
||||
/*****************************************************************************
|
||||
* mc.h: x86 motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2025 x264 project
|
||||
*
|
||||
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
* Laurent Aimar <fenrir@via.ecp.fr>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_MC_H
|
||||
#define X264_X86_MC_H
|
||||
|
||||
#define x264_mc_init_mmx x264_template(mc_init_mmx)
|
||||
void x264_mc_init_mmx( uint32_t cpu, x264_mc_functions_t *pf );
|
||||
|
||||
#endif
|
||||
423
common/x86/pixel-32.asm
Normal file
423
common/x86/pixel-32.asm
Normal file
@@ -0,0 +1,423 @@
|
||||
;*****************************************************************************
|
||||
;* pixel-32.asm: x86_32 pixel metrics
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2003-2025 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;* Laurent Aimar <fenrir@via.ecp.fr>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
cextern pw_ppmmppmm
|
||||
cextern pw_pmpmpmpm
|
||||
|
||||
SECTION .text
|
||||
INIT_MMX mmx2
|
||||
|
||||
%if HIGH_BIT_DEPTH == 0
|
||||
|
||||
; LOAD_DIFF_4x8P dx
; Invokes LOAD_DIFF eight times, once per row, targeting m0..m7 in row order.
; The single parameter (dx) is a constant offset added to every row address.
; Row addresses are built from r0 and r2, using r1 and r3 as the per-row step
; and r4 and r5 as the offset of the fourth row of each group. r4/r5 are not
; set in this macro (presumably 3*r1 and 3*r3 -- confirm against the caller).
; Side effects: r0 and r2 are advanced by four rows (4*r1, 4*r3) and are not
; restored. The caller must define `spill` as at least 8 bytes of scratch.
; NOTE(review): LOAD_DIFF is defined elsewhere; its second operand looks like a
; temporary register and `none` like a "no zero register" marker -- confirm.
%macro LOAD_DIFF_4x8P 1 ; dx
|
||||
LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
|
||||
LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
|
||||
LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
|
||||
LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
|
||||
; step both row pointers down four rows for the second group
lea r0, [r0+4*r1]
|
||||
lea r2, [r2+4*r3]
|
||||
LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
|
||||
LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
|
||||
LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
|
||||
; all eight registers are in use now: park m5 so it can act as the second
; operand of the final LOAD_DIFF, then bring it back
movq [spill], m5
|
||||
LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
|
||||
movq m5, [spill]
|
||||
%endmacro
|
||||
|
||||
; SUM4x8_MM
; Applies ABSW2 to the register pairs (m0,m1) (m2,m3) (m4,m5) (m6,m7) and adds
; all eight results together with paddw (wrapping 16-bit adds), leaving the
; per-lane total in m0.
; m6/m7 are parked in [spill] and [spill+8] while they serve as ABSW2 scratch,
; so the caller must define `spill` with 16 bytes of scratch memory.
; Every register m1..m7 is overwritten.
; NOTE(review): ABSW2 is defined elsewhere; presumably a two-register per-word
; absolute value whose last two operands are temporaries -- confirm.
%macro SUM4x8_MM 0
|
||||
movq [spill], m6
|
||||
movq [spill+8], m7
|
||||
ABSW2 m0, m1, m0, m1, m6, m7
|
||||
ABSW2 m2, m3, m2, m3, m6, m7
|
||||
paddw m0, m2
|
||||
paddw m1, m3
|
||||
; m2/m3 have been folded into m0/m1 and are free to be scratch from here on
movq m6, [spill]
|
||||
movq m7, [spill+8]
|
||||
ABSW2 m4, m5, m4, m5, m2, m3
|
||||
ABSW2 m6, m7, m6, m7, m2, m3
|
||||
paddw m4, m6
|
||||
paddw m5, m7
|
||||
paddw m0, m4
|
||||
paddw m1, m5
|
||||
paddw m0, m1
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;
; Internal entry point: cglobal is given no argument counts, so nothing is
; loaded here. r0/r2 (row pointers), r1/r3 (their steps) and r4/r5 (fourth-row
; offsets used by LOAD_DIFF_4x8P) must already be set up by the caller.
; The result is left in m0 as packed words; no integer register is written
; with a return value in this routine.
;
; Stack frame (0x74 bytes below the two saved pointers):
;   trans = esp+0x00 .. esp+0x5f   twelve 8-byte rows of transposed data
;   spill = esp+0x60 .. esp+0x6f   scratch used by the macros and transposes
;   args  = esp+0x74               [args] = saved r2, [args+4] = saved r0
; r0 and r2 are modified and NOT restored; m0..m7 are all overwritten.
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal pixel_sa8d_8x8_internal
|
||||
; save the row pointers: LOAD_DIFF_4x8P advances them, and the second
; (dx = 4) pass reloads the saved copies
push r0
|
||||
push r2
|
||||
sub esp, 0x74
|
||||
%define args esp+0x74
|
||||
%define spill esp+0x60 ; +16
|
||||
%define trans esp+0 ; +96
|
||||
; ---- pass 1: dx = 0 ----
LOAD_DIFF_4x8P 0
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
; m1 is handed to TRANSPOSE4x4W as its scratch register, so keep a copy
movq [spill], m1
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 1
|
||||
movq [trans+0x00], m4
|
||||
movq [trans+0x08], m5
|
||||
movq [trans+0x10], m6
|
||||
movq [trans+0x18], m7
|
||||
movq m1, [spill]
|
||||
; m4 was just stored, so it can be the scratch register this time
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
movq [trans+0x20], m0
|
||||
movq [trans+0x28], m1
|
||||
movq [trans+0x30], m2
|
||||
movq [trans+0x38], m3
|
||||
|
||||
; ---- pass 2: dx = 4, starting again from the saved row pointers ----
mov r0, [args+4]
|
||||
mov r2, [args]
|
||||
LOAD_DIFF_4x8P 4
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill], m7
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 7
|
||||
movq [trans+0x40], m0
|
||||
movq [trans+0x48], m1
|
||||
movq [trans+0x50], m2
|
||||
movq [trans+0x58], m3
|
||||
movq m7, [spill]
|
||||
; m4..m7 stay in registers after this transpose; m0..m3 are refilled from
; the rows stored at trans+0x00..0x18 during pass 1
TRANSPOSE4x4W 4, 5, 6, 7, 1
|
||||
movq m0, [trans+0x00]
|
||||
movq m1, [trans+0x08]
|
||||
movq m2, [trans+0x10]
|
||||
movq m3, [trans+0x18]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
SUM4x8_MM
|
||||
; first partial sum; trans+0x00 is free because its row was reloaded above
movq [trans], m0
|
||||
|
||||
; remaining rows: trans+0x20..0x38 from pass 1, trans+0x40..0x58 from pass 2
movq m0, [trans+0x20]
|
||||
movq m1, [trans+0x28]
|
||||
movq m2, [trans+0x30]
|
||||
movq m3, [trans+0x38]
|
||||
movq m4, [trans+0x40]
|
||||
movq m5, [trans+0x48]
|
||||
movq m6, [trans+0x50]
|
||||
movq m7, [trans+0x58]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
SUM4x8_MM
|
||||
|
||||
; combine both partial sums with pavgw (per-word unsigned rounded average)
pavgw m0, [trans]
|
||||
; 0x7c = 0x74 bytes of locals + 8 bytes for the two pushes made on entry;
; the pushed values are discarded rather than popped
add esp, 0x7c
|
||||
ret
|
||||
%undef args
|
||||
%undef spill
|
||||
%undef trans
|
||||
|
||||
; SUM_MM_X3 sum0, sum1, sum2, tmp0, tmp1, tmp2, tmp3, op
; Reduces three MMX registers of packed words in parallel.
;   args 1-3: registers to reduce (updated in place)
;   args 4-6: temporaries that receive the shuffled copies
;   arg  7  : temporary that is zeroed and used to widen words to dwords
;   arg  8  : instruction applied in the final combining step
; Stage 1 adds each register to a q1032-shuffled copy of itself with paddusw
; (unsigned saturating word add). Stage 2 widens the two low words to dwords
; by interleaving with zero. Stage 3 shuffles again and combines with `op`.
; NOTE(review): q1032 is defined elsewhere; presumably the pshufw immediate
; that swaps the upper and lower 32-bit halves -- confirm.
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
|
||||
pxor %7, %7
|
||||
pshufw %4, %1, q1032
|
||||
pshufw %5, %2, q1032
|
||||
pshufw %6, %3, q1032
|
||||
paddusw %1, %4
|
||||
paddusw %2, %5
|
||||
paddusw %3, %6
|
||||
punpcklwd %1, %7
|
||||
punpcklwd %2, %7
|
||||
punpcklwd %3, %7
|
||||
pshufw %4, %1, q1032
|
||||
pshufw %5, %2, q1032
|
||||
pshufw %6, %3, q1032
|
||||
%8 %1, %4
|
||||
%8 %2, %5
|
||||
%8 %3, %6
|
||||
%endmacro
|
||||
|
||||
; LOAD_4x8P dx
; Loads eight rows of four bytes from r0 + dx + row*FENC_STRIDE and widens each
; byte to a 16-bit word (punpcklbw against a zeroed m7), leaving row N in mN.
; m7 has to stay zero until every other row is widened, so row 7 is loaded and
; widened through m6 first, parked in [spill], and moved into m7 at the very
; end; row 6 is loaded into m6 afterwards.
; The caller must define `spill` as at least 8 bytes of scratch. r0 is not
; modified. FENC_STRIDE is a constant defined elsewhere.
%macro LOAD_4x8P 1 ; dx
|
||||
pxor m7, m7
|
||||
movd m6, [r0+%1+7*FENC_STRIDE]
|
||||
movd m0, [r0+%1+0*FENC_STRIDE]
|
||||
movd m1, [r0+%1+1*FENC_STRIDE]
|
||||
movd m2, [r0+%1+2*FENC_STRIDE]
|
||||
movd m3, [r0+%1+3*FENC_STRIDE]
|
||||
movd m4, [r0+%1+4*FENC_STRIDE]
|
||||
movd m5, [r0+%1+5*FENC_STRIDE]
|
||||
punpcklbw m6, m7
|
||||
punpcklbw m0, m7
|
||||
punpcklbw m1, m7
|
||||
; widened row 7 goes to memory so m6 can be reused for row 6
movq [spill], m6
|
||||
punpcklbw m2, m7
|
||||
punpcklbw m3, m7
|
||||
movd m6, [r0+%1+6*FENC_STRIDE]
|
||||
punpcklbw m4, m7
|
||||
punpcklbw m5, m7
|
||||
punpcklbw m6, m7
|
||||
; the zero register is no longer needed: replace it with row 7
movq m7, [spill]
|
||||
%endmacro
|
||||
|
||||
; HSUMSUB2 a, b, shuf, mul
; Per 16-bit lane, using pshufw with immediate `shuf` and pmullw by `mul`:
;   a = a * mul + shuffle(a)
;   b = b + shuffle(b) * mul
; Note the asymmetry: for `a` the unshuffled value is multiplied, for `b` the
; shuffled copy is. With a mul of +1/-1 words this produces sums in some lanes
; and differences in others (presumably the intent, given the name -- confirm).
; Clobbers m4 and m5, so neither may be passed as an argument.
%macro HSUMSUB2 4
|
||||
pshufw m4, %1, %3
|
||||
pshufw m5, %2, %3
|
||||
pmullw %1, %4
|
||||
pmullw m5, %4
|
||||
paddw %1, m4
|
||||
paddw %2, m5
|
||||
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;
; Register use visible in this routine:
;   r0 = fenc, read by LOAD_4x8P at multiples of FENC_STRIDE
;   r1 = edge, of which bytes 7..14 and 16..23 are read
;   r2 = scratch for the dc term, then reloaded with the res pointer (r2m)
; Output: two dwords at res+0 (one movq, labelled "v, h" below) and one dword
; at res+8 (labelled "dc").
;
; Stack frame (0x94 bytes):
;   trans = esp+0x00 .. esp+0x5f   transposed coefficient rows
;   sum   = esp+0x00 .. esp+0x1f   aliases the start of trans; each part is
;                                  written only after the overlapping trans
;                                  rows have been loaded back into registers
;   spill = esp+0x60 .. esp+0x6f   register scratch
;   edge  = esp+0x70 .. esp+0x8f   four 8-byte rows of transformed edge words
; NOTE(review): HADAMARD8_V, TRANSPOSE4x4W, ABSW and ABSW2 are defined
; elsewhere; the comments below describe only what is visible here.
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal intra_sa8d_x3_8x8, 2,3
|
||||
SUB esp, 0x94
|
||||
%define edge esp+0x70 ; +32
|
||||
%define spill esp+0x60 ; +16
|
||||
%define trans esp+0 ; +96
|
||||
%define sum esp+0 ; +32
|
||||
|
||||
; ---- transform the edge pixels ----
; widen 8 bytes from edge+7 into m0 (low half) / m1 (high half) and 8 bytes
; from edge+16 into m2 / m3
pxor m7, m7
|
||||
movq m0, [r1+7]
|
||||
movq m2, [r1+16]
|
||||
movq m1, m0
|
||||
movq m3, m2
|
||||
punpcklbw m0, m7
|
||||
punpckhbw m1, m7
|
||||
punpcklbw m2, m7
|
||||
punpckhbw m3, m7
|
||||
; two HSUMSUB2 stages, each with its own shuffle and multiplier constant
; (pw_ppmmppmm / pw_pmpmpmpm are external; presumably +/-1 word patterns)
movq m6, [pw_ppmmppmm]
|
||||
HSUMSUB2 m0, m2, q1032, m6
|
||||
HSUMSUB2 m1, m3, q1032, m6
|
||||
movq m6, [pw_pmpmpmpm]
|
||||
HSUMSUB2 m0, m2, q2301, m6
|
||||
HSUMSUB2 m1, m3, q2301, m6
|
||||
; final stage across the register halves: sums in m0/m2, differences in m4/m3
movq m4, m0
|
||||
movq m5, m2
|
||||
paddw m0, m1
|
||||
paddw m2, m3
|
||||
psubw m4, m1
|
||||
psubw m3, m5
|
||||
movq [edge+0], m0
|
||||
movq [edge+8], m4
|
||||
movq [edge+16], m2
|
||||
movq [edge+24], m3
|
||||
|
||||
; ---- source block, dx = 0 ----
LOAD_4x8P 0
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
; m0 is lent to TRANSPOSE4x4W as its scratch register, so keep a copy
movq [spill], m0
|
||||
TRANSPOSE4x4W 4, 5, 6, 7, 0
|
||||
movq [trans+0x00], m4
|
||||
movq [trans+0x08], m5
|
||||
movq [trans+0x10], m6
|
||||
movq [trans+0x18], m7
|
||||
movq m0, [spill]
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 4
|
||||
movq [trans+0x20], m0
|
||||
movq [trans+0x28], m1
|
||||
movq [trans+0x30], m2
|
||||
movq [trans+0x38], m3
|
||||
|
||||
; ---- source block, dx = 4 (r0 is unchanged by LOAD_4x8P) ----
LOAD_4x8P 4
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
movq [spill], m7
|
||||
TRANSPOSE4x4W 0, 1, 2, 3, 7
|
||||
movq [trans+0x40], m0
|
||||
movq [trans+0x48], m1
|
||||
movq [trans+0x50], m2
|
||||
movq [trans+0x58], m3
|
||||
movq m7, [spill]
|
||||
; m4..m7 stay in registers; m0..m3 are refilled from trans+0x00..0x18
TRANSPOSE4x4W 4, 5, 6, 7, 0
|
||||
movq m0, [trans+0x00]
|
||||
movq m1, [trans+0x08]
|
||||
movq m2, [trans+0x10]
|
||||
movq m3, [trans+0x18]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
; ---- first half: accumulate |m1|..|m7| into m2; m0 is handled separately ----
; m0/m1 are parked so they can serve as ABSW2 scratch
movq [spill+0], m0
|
||||
movq [spill+8], m1
|
||||
ABSW2 m2, m3, m2, m3, m0, m1
|
||||
ABSW2 m4, m5, m4, m5, m0, m1
|
||||
paddw m2, m4
|
||||
paddw m3, m5
|
||||
ABSW2 m6, m7, m6, m7, m4, m5
|
||||
movq m0, [spill+0]
|
||||
movq m1, [spill+8]
|
||||
paddw m2, m6
|
||||
paddw m3, m7
|
||||
paddw m2, m3
|
||||
ABSW m1, m1, m4
|
||||
paddw m2, m1 ; 7x4 sum
|
||||
; m0 is used twice: as is (for the "dc" slot) and with the shifted edge+8 row
; subtracted (for the "left" slot)
movq m7, m0
|
||||
movq m1, [edge+8] ; left bottom
|
||||
psllw m1, 3
|
||||
psubw m7, m1
|
||||
ABSW2 m0, m7, m0, m7, m5, m3
|
||||
paddw m0, m2
|
||||
paddw m7, m2
|
||||
; sum overlays trans+0x00/0x08, whose rows were reloaded above
movq [sum+0], m0 ; dc
|
||||
movq [sum+8], m7 ; left
|
||||
|
||||
; ---- second half: rows trans+0x20..0x58 ----
movq m0, [trans+0x20]
|
||||
movq m1, [trans+0x28]
|
||||
movq m2, [trans+0x30]
|
||||
movq m3, [trans+0x38]
|
||||
movq m4, [trans+0x40]
|
||||
movq m5, [trans+0x48]
|
||||
movq m6, [trans+0x50]
|
||||
movq m7, [trans+0x58]
|
||||
|
||||
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
|
||||
|
||||
; Overlapping 4-byte stores at 2-byte steps: each later store overwrites the
; upper word written by the previous one, so the low word of m0..m7 ends up
; packed at sum+0x10..0x1f. The last store also touches sum+0x20..0x21
; (= trans+0x20), which has already been loaded back above.
movd [sum+0x10], m0
|
||||
movd [sum+0x12], m1
|
||||
movd [sum+0x14], m2
|
||||
movd [sum+0x16], m3
|
||||
movd [sum+0x18], m4
|
||||
movd [sum+0x1a], m5
|
||||
movd [sum+0x1c], m6
|
||||
movd [sum+0x1e], m7
|
||||
|
||||
; same accumulation as in the first half, with a different add order
movq [spill], m0
|
||||
movq [spill+8], m1
|
||||
ABSW2 m2, m3, m2, m3, m0, m1
|
||||
ABSW2 m4, m5, m4, m5, m0, m1
|
||||
paddw m2, m4
|
||||
paddw m3, m5
|
||||
paddw m2, m3
|
||||
movq m0, [spill]
|
||||
movq m1, [spill+8]
|
||||
ABSW2 m6, m7, m6, m7, m4, m5
|
||||
ABSW m1, m1, m3
|
||||
paddw m2, m7
|
||||
paddw m1, m6
|
||||
paddw m2, m1 ; 7x4 sum
|
||||
movq m1, m0
|
||||
|
||||
movq m7, [edge+0]
|
||||
psllw m7, 3 ; left top
|
||||
|
||||
; dc term computed in an integer register: 32-bit loads of the first dwords
; of the edge+0 and edge+16 rows are added, scaled by 4 and biased by 32,
; then masked to bits 6..15 (the upper word and the low six bits are dropped)
mov r2, [edge+0]
|
||||
add r2, [edge+16]
|
||||
lea r2, [4*r2+32]
|
||||
and r2, 0xffc0
|
||||
movd m6, r2 ; dc
|
||||
|
||||
psubw m1, m7
|
||||
psubw m0, m6
|
||||
ABSW2 m0, m1, m0, m1, m5, m6
|
||||
movq m3, [sum+0] ; dc
|
||||
paddw m0, m2
|
||||
paddw m1, m2
|
||||
; m2 becomes the third accumulator: a copy of m0 shifted right by one word
; lane (psrlq 16), plus the stored first-half "dc" sum
movq m2, m0
|
||||
paddw m0, m3
|
||||
paddw m1, [sum+8] ; h
|
||||
psrlq m2, 16
|
||||
paddw m2, m3
|
||||
|
||||
; add |(edge row << 3) - packed low words saved at sum+16 / sum+24|
movq m3, [edge+16] ; top left
|
||||
movq m4, [edge+24] ; top right
|
||||
psllw m3, 3
|
||||
psllw m4, 3
|
||||
psubw m3, [sum+16]
|
||||
psubw m4, [sum+24]
|
||||
ABSW2 m3, m4, m3, m4, m5, m6
|
||||
paddw m2, m3
|
||||
paddw m2, m4 ; v
|
||||
|
||||
; horizontal reduction of the three accumulators, combining with pavgw
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
|
||||
; r2 was scratch above; fetch the res pointer from its argument slot
mov r2, r2m
|
||||
pxor m7, m7
|
||||
; pack m2 (low dword) and m1 (high dword) so both go out in one movq
punpckldq m2, m1
|
||||
; pavgw against zero computes (x+1)>>1 per word
pavgw m0, m7
|
||||
pavgw m2, m7
|
||||
movd [r2+8], m0 ; dc
|
||||
movq [r2+0], m2 ; v, h
|
||||
ADD esp, 0x94
|
||||
RET
|
||||
%undef edge
|
||||
%undef spill
|
||||
%undef trans
|
||||
%undef sum
|
||||
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
|
||||
; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;
; Processes two 4x4 blocks, at byte offsets 4 and 0 from pix1/pix2, in that
; order. For each block it stores four dwords at sums + 4*offset (so offset 4
; fills sums[1] and offset 0 fills sums[0]):
;   [0] sum of the pix1 bytes          [1] sum of the pix2 bytes
;   [2] sum of squares of both blocks  [3] sum of the pix1*pix2 products
; (this ordering assumes q0032 moves the upper 32 bits into the lower 32 bits;
; the constant is defined elsewhere -- confirm).
; cglobal loads no arguments (0 args, 5 registers); every argument is fetched
; from its stack slot (rNm) by the code itself. Executes emms before returning.
|
||||
;-----------------------------------------------------------------------------
|
||||
cglobal pixel_ssim_4x4x2_core, 0,5
|
||||
mov r1, r1m
|
||||
mov r3, r3m
|
||||
; r4 = byte offset of the current block: 4 on the first pass, 0 on the second
mov r4, 4
|
||||
; m0 stays zero for the whole routine (used for byte/word widening)
pxor m0, m0
|
||||
.loop:
|
||||
mov r0, r0m
|
||||
mov r2, r2m
|
||||
add r0, r4
|
||||
add r2, r4
|
||||
; accumulators: m1 = sum pix1, m2 = sum pix2, m3 = squares, m4 = products
pxor m1, m1
|
||||
pxor m2, m2
|
||||
pxor m3, m3
|
||||
pxor m4, m4
|
||||
; one repetition per row: load 4 bytes from each block and widen to words
%rep 4
|
||||
movd m5, [r0]
|
||||
movd m6, [r2]
|
||||
punpcklbw m5, m0
|
||||
punpcklbw m6, m0
|
||||
paddw m1, m5
|
||||
paddw m2, m6
|
||||
movq m7, m5
|
||||
pmaddwd m5, m5
|
||||
pmaddwd m7, m6
|
||||
pmaddwd m6, m6
|
||||
paddd m3, m5
|
||||
paddd m4, m7
|
||||
paddd m3, m6
|
||||
add r0, r1
|
||||
add r2, r3
|
||||
%endrep
|
||||
; r0 is reused as the output pointer: sums + r4*4 bytes
mov r0, r4m
|
||||
lea r0, [r0+r4*4]
|
||||
; horizontal reduction of the word sums (m1, m2) and dword sums (m3, m4)
pshufw m5, m1, q0032
|
||||
pshufw m6, m2, q0032
|
||||
paddusw m1, m5
|
||||
paddusw m2, m6
|
||||
punpcklwd m1, m2
|
||||
pshufw m2, m1, q0032
|
||||
pshufw m5, m3, q0032
|
||||
pshufw m6, m4, q0032
|
||||
paddusw m1, m2
|
||||
paddd m3, m5
|
||||
paddd m4, m6
|
||||
; widen the two word totals to dwords and pair up the two dword totals
punpcklwd m1, m0
|
||||
punpckldq m3, m4
|
||||
movq [r0+0], m1
|
||||
movq [r0+8], m3
|
||||
; 4 -> 0 loops once more (jge is taken on zero); 0 -> -4 falls through
sub r4, 4
|
||||
jge .loop
|
||||
emms
|
||||
RET
|
||||
|
||||
%endif ; !HIGH_BIT_DEPTH
|
||||
5851
common/x86/pixel-a.asm
Normal file
5851
common/x86/pixel-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
623
common/x86/pixel.h
Normal file
623
common/x86/pixel.h
Normal file
@@ -0,0 +1,623 @@
|
||||
/*****************************************************************************
|
||||
* pixel.h: x86 pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2025 x264 project
|
||||
*
|
||||
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||
* Loren Merritt <lorenm@u.washington.edu>
|
||||
* Fiona Glaser <fiona@x264.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_PIXEL_H
|
||||
#define X264_X86_PIXEL_H
|
||||
|
||||
#define x264_pixel_ads1_avx x264_template(pixel_ads1_avx)
|
||||
#define x264_pixel_ads1_avx2 x264_template(pixel_ads1_avx2)
|
||||
#define x264_pixel_ads1_mmx2 x264_template(pixel_ads1_mmx2)
|
||||
#define x264_pixel_ads1_sse2 x264_template(pixel_ads1_sse2)
|
||||
#define x264_pixel_ads1_ssse3 x264_template(pixel_ads1_ssse3)
|
||||
#define x264_pixel_ads2_avx x264_template(pixel_ads2_avx)
|
||||
#define x264_pixel_ads2_avx2 x264_template(pixel_ads2_avx2)
|
||||
#define x264_pixel_ads2_mmx2 x264_template(pixel_ads2_mmx2)
|
||||
#define x264_pixel_ads2_sse2 x264_template(pixel_ads2_sse2)
|
||||
#define x264_pixel_ads2_ssse3 x264_template(pixel_ads2_ssse3)
|
||||
#define x264_pixel_ads4_avx x264_template(pixel_ads4_avx)
|
||||
#define x264_pixel_ads4_avx2 x264_template(pixel_ads4_avx2)
|
||||
#define x264_pixel_ads4_mmx2 x264_template(pixel_ads4_mmx2)
|
||||
#define x264_pixel_ads4_sse2 x264_template(pixel_ads4_sse2)
|
||||
#define x264_pixel_ads4_ssse3 x264_template(pixel_ads4_ssse3)
|
||||
#define x264_pixel_hadamard_ac_16x16_avx x264_template(pixel_hadamard_ac_16x16_avx)
|
||||
#define x264_pixel_hadamard_ac_16x16_avx2 x264_template(pixel_hadamard_ac_16x16_avx2)
|
||||
#define x264_pixel_hadamard_ac_16x16_mmx2 x264_template(pixel_hadamard_ac_16x16_mmx2)
|
||||
#define x264_pixel_hadamard_ac_16x16_sse2 x264_template(pixel_hadamard_ac_16x16_sse2)
|
||||
#define x264_pixel_hadamard_ac_16x16_sse4 x264_template(pixel_hadamard_ac_16x16_sse4)
|
||||
#define x264_pixel_hadamard_ac_16x16_ssse3 x264_template(pixel_hadamard_ac_16x16_ssse3)
|
||||
#define x264_pixel_hadamard_ac_16x16_ssse3_atom x264_template(pixel_hadamard_ac_16x16_ssse3_atom)
|
||||
#define x264_pixel_hadamard_ac_16x16_xop x264_template(pixel_hadamard_ac_16x16_xop)
|
||||
#define x264_pixel_hadamard_ac_16x8_avx x264_template(pixel_hadamard_ac_16x8_avx)
|
||||
#define x264_pixel_hadamard_ac_16x8_avx2 x264_template(pixel_hadamard_ac_16x8_avx2)
|
||||
#define x264_pixel_hadamard_ac_16x8_mmx2 x264_template(pixel_hadamard_ac_16x8_mmx2)
|
||||
#define x264_pixel_hadamard_ac_16x8_sse2 x264_template(pixel_hadamard_ac_16x8_sse2)
|
||||
#define x264_pixel_hadamard_ac_16x8_sse4 x264_template(pixel_hadamard_ac_16x8_sse4)
|
||||
#define x264_pixel_hadamard_ac_16x8_ssse3 x264_template(pixel_hadamard_ac_16x8_ssse3)
|
||||
#define x264_pixel_hadamard_ac_16x8_ssse3_atom x264_template(pixel_hadamard_ac_16x8_ssse3_atom)
|
||||
#define x264_pixel_hadamard_ac_16x8_xop x264_template(pixel_hadamard_ac_16x8_xop)
|
||||
#define x264_pixel_hadamard_ac_8x16_avx x264_template(pixel_hadamard_ac_8x16_avx)
|
||||
#define x264_pixel_hadamard_ac_8x16_mmx2 x264_template(pixel_hadamard_ac_8x16_mmx2)
|
||||
#define x264_pixel_hadamard_ac_8x16_sse2 x264_template(pixel_hadamard_ac_8x16_sse2)
|
||||
#define x264_pixel_hadamard_ac_8x16_sse4 x264_template(pixel_hadamard_ac_8x16_sse4)
|
||||
#define x264_pixel_hadamard_ac_8x16_ssse3 x264_template(pixel_hadamard_ac_8x16_ssse3)
|
||||
#define x264_pixel_hadamard_ac_8x16_ssse3_atom x264_template(pixel_hadamard_ac_8x16_ssse3_atom)
|
||||
#define x264_pixel_hadamard_ac_8x16_xop x264_template(pixel_hadamard_ac_8x16_xop)
|
||||
#define x264_pixel_hadamard_ac_8x8_avx x264_template(pixel_hadamard_ac_8x8_avx)
|
||||
#define x264_pixel_hadamard_ac_8x8_mmx2 x264_template(pixel_hadamard_ac_8x8_mmx2)
|
||||
#define x264_pixel_hadamard_ac_8x8_sse2 x264_template(pixel_hadamard_ac_8x8_sse2)
|
||||
#define x264_pixel_hadamard_ac_8x8_sse4 x264_template(pixel_hadamard_ac_8x8_sse4)
|
||||
#define x264_pixel_hadamard_ac_8x8_ssse3 x264_template(pixel_hadamard_ac_8x8_ssse3)
|
||||
#define x264_pixel_hadamard_ac_8x8_ssse3_atom x264_template(pixel_hadamard_ac_8x8_ssse3_atom)
|
||||
#define x264_pixel_hadamard_ac_8x8_xop x264_template(pixel_hadamard_ac_8x8_xop)
|
||||
#define x264_pixel_sa8d_16x16_mmx2 x264_template(pixel_sa8d_16x16_mmx2)
|
||||
#define x264_pixel_sa8d_16x16_avx x264_template(pixel_sa8d_16x16_avx)
|
||||
#define x264_pixel_sa8d_16x16_sse2 x264_template(pixel_sa8d_16x16_sse2)
|
||||
#define x264_pixel_sa8d_16x16_sse4 x264_template(pixel_sa8d_16x16_sse4)
|
||||
#define x264_pixel_sa8d_16x16_ssse3 x264_template(pixel_sa8d_16x16_ssse3)
|
||||
#define x264_pixel_sa8d_16x16_ssse3_atom x264_template(pixel_sa8d_16x16_ssse3_atom)
|
||||
#define x264_pixel_sa8d_16x16_xop x264_template(pixel_sa8d_16x16_xop)
|
||||
#define x264_pixel_sa8d_8x8_mmx2 x264_template(pixel_sa8d_8x8_mmx2)
|
||||
#define x264_pixel_sa8d_8x8_avx x264_template(pixel_sa8d_8x8_avx)
|
||||
#define x264_pixel_sa8d_8x8_avx2 x264_template(pixel_sa8d_8x8_avx2)
|
||||
#define x264_pixel_sa8d_8x8_avx512 x264_template(pixel_sa8d_8x8_avx512)
|
||||
#define x264_pixel_sa8d_8x8_sse2 x264_template(pixel_sa8d_8x8_sse2)
|
||||
#define x264_pixel_sa8d_8x8_sse4 x264_template(pixel_sa8d_8x8_sse4)
|
||||
#define x264_pixel_sa8d_8x8_ssse3 x264_template(pixel_sa8d_8x8_ssse3)
|
||||
#define x264_pixel_sa8d_8x8_ssse3_atom x264_template(pixel_sa8d_8x8_ssse3_atom)
|
||||
#define x264_pixel_sa8d_8x8_xop x264_template(pixel_sa8d_8x8_xop)
|
||||
#define x264_pixel_sad_16x16_avx2 x264_template(pixel_sad_16x16_avx2)
|
||||
#define x264_pixel_sad_16x16_avx512 x264_template(pixel_sad_16x16_avx512)
|
||||
#define x264_pixel_sad_16x16_cache32_mmx2 x264_template(pixel_sad_16x16_cache32_mmx2)
|
||||
#define x264_pixel_sad_16x16_cache64_mmx2 x264_template(pixel_sad_16x16_cache64_mmx2)
|
||||
#define x264_pixel_sad_16x16_cache64_sse2 x264_template(pixel_sad_16x16_cache64_sse2)
|
||||
#define x264_pixel_sad_16x16_cache64_ssse3 x264_template(pixel_sad_16x16_cache64_ssse3)
|
||||
#define x264_pixel_sad_16x16_mmx2 x264_template(pixel_sad_16x16_mmx2)
|
||||
#define x264_pixel_sad_16x16_sse2 x264_template(pixel_sad_16x16_sse2)
|
||||
#define x264_pixel_sad_16x16_sse2_aligned x264_template(pixel_sad_16x16_sse2_aligned)
|
||||
#define x264_pixel_sad_16x16_sse3 x264_template(pixel_sad_16x16_sse3)
|
||||
#define x264_pixel_sad_16x16_ssse3 x264_template(pixel_sad_16x16_ssse3)
|
||||
#define x264_pixel_sad_16x16_ssse3_aligned x264_template(pixel_sad_16x16_ssse3_aligned)
|
||||
#define x264_pixel_sad_16x8_avx2 x264_template(pixel_sad_16x8_avx2)
|
||||
#define x264_pixel_sad_16x8_avx512 x264_template(pixel_sad_16x8_avx512)
|
||||
#define x264_pixel_sad_16x8_cache32_mmx2 x264_template(pixel_sad_16x8_cache32_mmx2)
|
||||
#define x264_pixel_sad_16x8_cache64_mmx2 x264_template(pixel_sad_16x8_cache64_mmx2)
|
||||
#define x264_pixel_sad_16x8_cache64_sse2 x264_template(pixel_sad_16x8_cache64_sse2)
|
||||
#define x264_pixel_sad_16x8_cache64_ssse3 x264_template(pixel_sad_16x8_cache64_ssse3)
|
||||
#define x264_pixel_sad_16x8_mmx2 x264_template(pixel_sad_16x8_mmx2)
|
||||
#define x264_pixel_sad_16x8_sse2 x264_template(pixel_sad_16x8_sse2)
|
||||
#define x264_pixel_sad_16x8_sse2_aligned x264_template(pixel_sad_16x8_sse2_aligned)
|
||||
#define x264_pixel_sad_16x8_sse3 x264_template(pixel_sad_16x8_sse3)
|
||||
#define x264_pixel_sad_16x8_ssse3 x264_template(pixel_sad_16x8_ssse3)
|
||||
#define x264_pixel_sad_16x8_ssse3_aligned x264_template(pixel_sad_16x8_ssse3_aligned)
|
||||
#define x264_pixel_sad_4x16_avx512 x264_template(pixel_sad_4x16_avx512)
|
||||
#define x264_pixel_sad_4x16_mmx2 x264_template(pixel_sad_4x16_mmx2)
|
||||
#define x264_pixel_sad_4x4_avx512 x264_template(pixel_sad_4x4_avx512)
|
||||
#define x264_pixel_sad_4x4_mmx2 x264_template(pixel_sad_4x4_mmx2)
|
||||
#define x264_pixel_sad_4x4_ssse3 x264_template(pixel_sad_4x4_ssse3)
|
||||
#define x264_pixel_sad_4x8_avx512 x264_template(pixel_sad_4x8_avx512)
|
||||
#define x264_pixel_sad_4x8_mmx2 x264_template(pixel_sad_4x8_mmx2)
|
||||
#define x264_pixel_sad_4x8_ssse3 x264_template(pixel_sad_4x8_ssse3)
|
||||
#define x264_pixel_sad_8x16_avx512 x264_template(pixel_sad_8x16_avx512)
|
||||
#define x264_pixel_sad_8x16_cache32_mmx2 x264_template(pixel_sad_8x16_cache32_mmx2)
|
||||
#define x264_pixel_sad_8x16_cache64_mmx2 x264_template(pixel_sad_8x16_cache64_mmx2)
|
||||
#define x264_pixel_sad_8x16_mmx2 x264_template(pixel_sad_8x16_mmx2)
|
||||
#define x264_pixel_sad_8x16_sse2 x264_template(pixel_sad_8x16_sse2)
|
||||
#define x264_pixel_sad_8x16_sse2_aligned x264_template(pixel_sad_8x16_sse2_aligned)
|
||||
#define x264_pixel_sad_8x16_ssse3 x264_template(pixel_sad_8x16_ssse3)
|
||||
#define x264_pixel_sad_8x16_ssse3_aligned x264_template(pixel_sad_8x16_ssse3_aligned)
|
||||
#define x264_pixel_sad_8x4_avx512 x264_template(pixel_sad_8x4_avx512)
|
||||
#define x264_pixel_sad_8x4_cache32_mmx2 x264_template(pixel_sad_8x4_cache32_mmx2)
|
||||
#define x264_pixel_sad_8x4_cache64_mmx2 x264_template(pixel_sad_8x4_cache64_mmx2)
|
||||
#define x264_pixel_sad_8x4_mmx2 x264_template(pixel_sad_8x4_mmx2)
|
||||
#define x264_pixel_sad_8x4_sse2 x264_template(pixel_sad_8x4_sse2)
|
||||
#define x264_pixel_sad_8x4_ssse3 x264_template(pixel_sad_8x4_ssse3)
|
||||
#define x264_pixel_sad_8x8_avx512 x264_template(pixel_sad_8x8_avx512)
|
||||
#define x264_pixel_sad_8x8_cache32_mmx2 x264_template(pixel_sad_8x8_cache32_mmx2)
|
||||
#define x264_pixel_sad_8x8_cache64_mmx2 x264_template(pixel_sad_8x8_cache64_mmx2)
|
||||
#define x264_pixel_sad_8x8_mmx2 x264_template(pixel_sad_8x8_mmx2)
|
||||
#define x264_pixel_sad_8x8_sse2 x264_template(pixel_sad_8x8_sse2)
|
||||
#define x264_pixel_sad_8x8_sse2_aligned x264_template(pixel_sad_8x8_sse2_aligned)
|
||||
#define x264_pixel_sad_8x8_ssse3 x264_template(pixel_sad_8x8_ssse3)
|
||||
#define x264_pixel_sad_8x8_ssse3_aligned x264_template(pixel_sad_8x8_ssse3_aligned)
|
||||
#define x264_pixel_sad_x3_16x16_avx x264_template(pixel_sad_x3_16x16_avx)
|
||||
#define x264_pixel_sad_x3_16x16_avx2 x264_template(pixel_sad_x3_16x16_avx2)
|
||||
#define x264_pixel_sad_x3_16x16_avx512 x264_template(pixel_sad_x3_16x16_avx512)
|
||||
#define x264_pixel_sad_x3_16x16_cache32_mmx2 x264_template(pixel_sad_x3_16x16_cache32_mmx2)
|
||||
#define x264_pixel_sad_x3_16x16_cache64_mmx2 x264_template(pixel_sad_x3_16x16_cache64_mmx2)
|
||||
#define x264_pixel_sad_x3_16x16_cache64_sse2 x264_template(pixel_sad_x3_16x16_cache64_sse2)
|
||||
#define x264_pixel_sad_x3_16x16_cache64_ssse3 x264_template(pixel_sad_x3_16x16_cache64_ssse3)
|
||||
#define x264_pixel_sad_x3_16x16_mmx2 x264_template(pixel_sad_x3_16x16_mmx2)
|
||||
#define x264_pixel_sad_x3_16x16_sse2 x264_template(pixel_sad_x3_16x16_sse2)
|
||||
#define x264_pixel_sad_x3_16x16_sse3 x264_template(pixel_sad_x3_16x16_sse3)
|
||||
#define x264_pixel_sad_x3_16x16_ssse3 x264_template(pixel_sad_x3_16x16_ssse3)
|
||||
#define x264_pixel_sad_x3_16x16_xop x264_template(pixel_sad_x3_16x16_xop)
|
||||
#define x264_pixel_sad_x3_16x8_avx x264_template(pixel_sad_x3_16x8_avx)
|
||||
#define x264_pixel_sad_x3_16x8_avx2 x264_template(pixel_sad_x3_16x8_avx2)
|
||||
#define x264_pixel_sad_x3_16x8_avx512 x264_template(pixel_sad_x3_16x8_avx512)
|
||||
#define x264_pixel_sad_x3_16x8_cache32_mmx2 x264_template(pixel_sad_x3_16x8_cache32_mmx2)
|
||||
#define x264_pixel_sad_x3_16x8_cache64_mmx2 x264_template(pixel_sad_x3_16x8_cache64_mmx2)
|
||||
#define x264_pixel_sad_x3_16x8_cache64_sse2 x264_template(pixel_sad_x3_16x8_cache64_sse2)
|
||||
#define x264_pixel_sad_x3_16x8_cache64_ssse3 x264_template(pixel_sad_x3_16x8_cache64_ssse3)
|
||||
#define x264_pixel_sad_x3_16x8_mmx2 x264_template(pixel_sad_x3_16x8_mmx2)
|
||||
#define x264_pixel_sad_x3_16x8_sse2 x264_template(pixel_sad_x3_16x8_sse2)
|
||||
#define x264_pixel_sad_x3_16x8_sse3 x264_template(pixel_sad_x3_16x8_sse3)
|
||||
#define x264_pixel_sad_x3_16x8_ssse3 x264_template(pixel_sad_x3_16x8_ssse3)
|
||||
#define x264_pixel_sad_x3_16x8_xop x264_template(pixel_sad_x3_16x8_xop)
|
||||
#define x264_pixel_sad_x3_4x4_avx512 x264_template(pixel_sad_x3_4x4_avx512)
|
||||
#define x264_pixel_sad_x3_4x4_mmx2 x264_template(pixel_sad_x3_4x4_mmx2)
|
||||
#define x264_pixel_sad_x3_4x4_ssse3 x264_template(pixel_sad_x3_4x4_ssse3)
|
||||
#define x264_pixel_sad_x3_4x8_avx512 x264_template(pixel_sad_x3_4x8_avx512)
|
||||
#define x264_pixel_sad_x3_4x8_mmx2 x264_template(pixel_sad_x3_4x8_mmx2)
|
||||
#define x264_pixel_sad_x3_4x8_ssse3 x264_template(pixel_sad_x3_4x8_ssse3)
|
||||
#define x264_pixel_sad_x3_8x16_avx512 x264_template(pixel_sad_x3_8x16_avx512)
|
||||
#define x264_pixel_sad_x3_8x16_cache32_mmx2 x264_template(pixel_sad_x3_8x16_cache32_mmx2)
|
||||
#define x264_pixel_sad_x3_8x16_cache64_mmx2 x264_template(pixel_sad_x3_8x16_cache64_mmx2)
|
||||
#define x264_pixel_sad_x3_8x16_cache64_sse2 x264_template(pixel_sad_x3_8x16_cache64_sse2)
|
||||
#define x264_pixel_sad_x3_8x16_mmx2 x264_template(pixel_sad_x3_8x16_mmx2)
|
||||
#define x264_pixel_sad_x3_8x16_sse2 x264_template(pixel_sad_x3_8x16_sse2)
|
||||
#define x264_pixel_sad_x3_8x16_ssse3 x264_template(pixel_sad_x3_8x16_ssse3)
|
||||
#define x264_pixel_sad_x3_8x16_xop x264_template(pixel_sad_x3_8x16_xop)
|
||||
#define x264_pixel_sad_x3_8x4_avx512 x264_template(pixel_sad_x3_8x4_avx512)
|
||||
#define x264_pixel_sad_x3_8x4_mmx2 x264_template(pixel_sad_x3_8x4_mmx2)
|
||||
#define x264_pixel_sad_x3_8x4_sse2 x264_template(pixel_sad_x3_8x4_sse2)
|
||||
#define x264_pixel_sad_x3_8x4_ssse3 x264_template(pixel_sad_x3_8x4_ssse3)
|
||||
#define x264_pixel_sad_x3_8x4_xop x264_template(pixel_sad_x3_8x4_xop)
|
||||
#define x264_pixel_sad_x3_8x8_avx512 x264_template(pixel_sad_x3_8x8_avx512)
|
||||
#define x264_pixel_sad_x3_8x8_cache32_mmx2 x264_template(pixel_sad_x3_8x8_cache32_mmx2)
|
||||
#define x264_pixel_sad_x3_8x8_cache64_mmx2 x264_template(pixel_sad_x3_8x8_cache64_mmx2)
|
||||
#define x264_pixel_sad_x3_8x8_mmx2 x264_template(pixel_sad_x3_8x8_mmx2)
|
||||
#define x264_pixel_sad_x3_8x8_sse2 x264_template(pixel_sad_x3_8x8_sse2)
|
||||
#define x264_pixel_sad_x3_8x8_ssse3 x264_template(pixel_sad_x3_8x8_ssse3)
|
||||
#define x264_pixel_sad_x3_8x8_xop x264_template(pixel_sad_x3_8x8_xop)
|
||||
#define x264_pixel_sad_x4_16x16_avx x264_template(pixel_sad_x4_16x16_avx)
|
||||
#define x264_pixel_sad_x4_16x16_avx2 x264_template(pixel_sad_x4_16x16_avx2)
|
||||
#define x264_pixel_sad_x4_16x16_avx512 x264_template(pixel_sad_x4_16x16_avx512)
|
||||
#define x264_pixel_sad_x4_16x16_cache32_mmx2 x264_template(pixel_sad_x4_16x16_cache32_mmx2)
|
||||
#define x264_pixel_sad_x4_16x16_cache64_mmx2 x264_template(pixel_sad_x4_16x16_cache64_mmx2)
|
||||
#define x264_pixel_sad_x4_16x16_cache64_sse2 x264_template(pixel_sad_x4_16x16_cache64_sse2)
|
||||
#define x264_pixel_sad_x4_16x16_cache64_ssse3 x264_template(pixel_sad_x4_16x16_cache64_ssse3)
|
||||
#define x264_pixel_sad_x4_16x16_mmx2 x264_template(pixel_sad_x4_16x16_mmx2)
|
||||
#define x264_pixel_sad_x4_16x16_sse2 x264_template(pixel_sad_x4_16x16_sse2)
|
||||
#define x264_pixel_sad_x4_16x16_sse3 x264_template(pixel_sad_x4_16x16_sse3)
|
||||
#define x264_pixel_sad_x4_16x16_ssse3 x264_template(pixel_sad_x4_16x16_ssse3)
|
||||
#define x264_pixel_sad_x4_16x16_xop x264_template(pixel_sad_x4_16x16_xop)
|
||||
#define x264_pixel_sad_x4_16x8_avx x264_template(pixel_sad_x4_16x8_avx)
|
||||
#define x264_pixel_sad_x4_16x8_avx2 x264_template(pixel_sad_x4_16x8_avx2)
|
||||
#define x264_pixel_sad_x4_16x8_avx512 x264_template(pixel_sad_x4_16x8_avx512)
|
||||
#define x264_pixel_sad_x4_16x8_cache32_mmx2 x264_template(pixel_sad_x4_16x8_cache32_mmx2)
|
||||
#define x264_pixel_sad_x4_16x8_cache64_mmx2 x264_template(pixel_sad_x4_16x8_cache64_mmx2)
|
||||
#define x264_pixel_sad_x4_16x8_cache64_sse2 x264_template(pixel_sad_x4_16x8_cache64_sse2)
|
||||
#define x264_pixel_sad_x4_16x8_cache64_ssse3 x264_template(pixel_sad_x4_16x8_cache64_ssse3)
|
||||
#define x264_pixel_sad_x4_16x8_mmx2 x264_template(pixel_sad_x4_16x8_mmx2)
|
||||
#define x264_pixel_sad_x4_16x8_sse2 x264_template(pixel_sad_x4_16x8_sse2)
|
||||
#define x264_pixel_sad_x4_16x8_sse3 x264_template(pixel_sad_x4_16x8_sse3)
|
||||
#define x264_pixel_sad_x4_16x8_ssse3 x264_template(pixel_sad_x4_16x8_ssse3)
|
||||
#define x264_pixel_sad_x4_16x8_xop x264_template(pixel_sad_x4_16x8_xop)
|
||||
#define x264_pixel_sad_x4_4x4_avx512 x264_template(pixel_sad_x4_4x4_avx512)
|
||||
#define x264_pixel_sad_x4_4x4_mmx2 x264_template(pixel_sad_x4_4x4_mmx2)
|
||||
#define x264_pixel_sad_x4_4x4_ssse3 x264_template(pixel_sad_x4_4x4_ssse3)
|
||||
#define x264_pixel_sad_x4_4x8_avx512 x264_template(pixel_sad_x4_4x8_avx512)
|
||||
#define x264_pixel_sad_x4_4x8_mmx2 x264_template(pixel_sad_x4_4x8_mmx2)
|
||||
#define x264_pixel_sad_x4_4x8_ssse3 x264_template(pixel_sad_x4_4x8_ssse3)
|
||||
#define x264_pixel_sad_x4_8x16_avx512 x264_template(pixel_sad_x4_8x16_avx512)
|
||||
#define x264_pixel_sad_x4_8x16_cache32_mmx2 x264_template(pixel_sad_x4_8x16_cache32_mmx2)
|
||||
#define x264_pixel_sad_x4_8x16_cache64_mmx2 x264_template(pixel_sad_x4_8x16_cache64_mmx2)
|
||||
#define x264_pixel_sad_x4_8x16_cache64_sse2 x264_template(pixel_sad_x4_8x16_cache64_sse2)
|
||||
#define x264_pixel_sad_x4_8x16_mmx2 x264_template(pixel_sad_x4_8x16_mmx2)
|
||||
#define x264_pixel_sad_x4_8x16_sse2 x264_template(pixel_sad_x4_8x16_sse2)
|
||||
#define x264_pixel_sad_x4_8x16_ssse3 x264_template(pixel_sad_x4_8x16_ssse3)
|
||||
#define x264_pixel_sad_x4_8x16_xop x264_template(pixel_sad_x4_8x16_xop)
|
||||
#define x264_pixel_sad_x4_8x4_avx512 x264_template(pixel_sad_x4_8x4_avx512)
|
||||
#define x264_pixel_sad_x4_8x4_mmx2 x264_template(pixel_sad_x4_8x4_mmx2)
|
||||
#define x264_pixel_sad_x4_8x4_sse2 x264_template(pixel_sad_x4_8x4_sse2)
|
||||
#define x264_pixel_sad_x4_8x4_ssse3 x264_template(pixel_sad_x4_8x4_ssse3)
|
||||
#define x264_pixel_sad_x4_8x4_xop x264_template(pixel_sad_x4_8x4_xop)
|
||||
#define x264_pixel_sad_x4_8x8_avx512 x264_template(pixel_sad_x4_8x8_avx512)
|
||||
#define x264_pixel_sad_x4_8x8_cache32_mmx2 x264_template(pixel_sad_x4_8x8_cache32_mmx2)
|
||||
#define x264_pixel_sad_x4_8x8_cache64_mmx2 x264_template(pixel_sad_x4_8x8_cache64_mmx2)
|
||||
#define x264_pixel_sad_x4_8x8_mmx2 x264_template(pixel_sad_x4_8x8_mmx2)
|
||||
#define x264_pixel_sad_x4_8x8_sse2 x264_template(pixel_sad_x4_8x8_sse2)
|
||||
#define x264_pixel_sad_x4_8x8_ssse3 x264_template(pixel_sad_x4_8x8_ssse3)
|
||||
#define x264_pixel_sad_x4_8x8_xop x264_template(pixel_sad_x4_8x8_xop)
|
||||
#define x264_pixel_satd_16x16_avx x264_template(pixel_satd_16x16_avx)
|
||||
#define x264_pixel_satd_16x16_avx2 x264_template(pixel_satd_16x16_avx2)
|
||||
#define x264_pixel_satd_16x16_avx512 x264_template(pixel_satd_16x16_avx512)
|
||||
#define x264_pixel_satd_16x16_mmx2 x264_template(pixel_satd_16x16_mmx2)
|
||||
#define x264_pixel_satd_16x16_sse2 x264_template(pixel_satd_16x16_sse2)
|
||||
#define x264_pixel_satd_16x16_sse4 x264_template(pixel_satd_16x16_sse4)
|
||||
#define x264_pixel_satd_16x16_ssse3 x264_template(pixel_satd_16x16_ssse3)
|
||||
#define x264_pixel_satd_16x16_ssse3_atom x264_template(pixel_satd_16x16_ssse3_atom)
|
||||
#define x264_pixel_satd_16x16_xop x264_template(pixel_satd_16x16_xop)
|
||||
#define x264_pixel_satd_16x8_avx x264_template(pixel_satd_16x8_avx)
|
||||
#define x264_pixel_satd_16x8_avx2 x264_template(pixel_satd_16x8_avx2)
|
||||
#define x264_pixel_satd_16x8_avx512 x264_template(pixel_satd_16x8_avx512)
|
||||
#define x264_pixel_satd_16x8_mmx2 x264_template(pixel_satd_16x8_mmx2)
|
||||
#define x264_pixel_satd_16x8_sse2 x264_template(pixel_satd_16x8_sse2)
|
||||
#define x264_pixel_satd_16x8_sse4 x264_template(pixel_satd_16x8_sse4)
|
||||
#define x264_pixel_satd_16x8_ssse3 x264_template(pixel_satd_16x8_ssse3)
|
||||
#define x264_pixel_satd_16x8_ssse3_atom x264_template(pixel_satd_16x8_ssse3_atom)
|
||||
#define x264_pixel_satd_16x8_xop x264_template(pixel_satd_16x8_xop)
|
||||
#define x264_pixel_satd_4x16_avx x264_template(pixel_satd_4x16_avx)
|
||||
#define x264_pixel_satd_4x16_avx512 x264_template(pixel_satd_4x16_avx512)
|
||||
#define x264_pixel_satd_4x16_mmx2 x264_template(pixel_satd_4x16_mmx2)
|
||||
#define x264_pixel_satd_4x16_sse2 x264_template(pixel_satd_4x16_sse2)
|
||||
#define x264_pixel_satd_4x16_sse4 x264_template(pixel_satd_4x16_sse4)
|
||||
#define x264_pixel_satd_4x16_ssse3 x264_template(pixel_satd_4x16_ssse3)
|
||||
#define x264_pixel_satd_4x16_ssse3_atom x264_template(pixel_satd_4x16_ssse3_atom)
|
||||
#define x264_pixel_satd_4x4_avx x264_template(pixel_satd_4x4_avx)
|
||||
#define x264_pixel_satd_4x4_avx512 x264_template(pixel_satd_4x4_avx512)
|
||||
#define x264_pixel_satd_4x4_mmx2 x264_template(pixel_satd_4x4_mmx2)
|
||||
#define x264_pixel_satd_4x4_sse4 x264_template(pixel_satd_4x4_sse4)
|
||||
#define x264_pixel_satd_4x4_ssse3 x264_template(pixel_satd_4x4_ssse3)
|
||||
#define x264_pixel_satd_4x4_xop x264_template(pixel_satd_4x4_xop)
|
||||
#define x264_pixel_satd_4x8_avx x264_template(pixel_satd_4x8_avx)
|
||||
#define x264_pixel_satd_4x8_avx512 x264_template(pixel_satd_4x8_avx512)
|
||||
#define x264_pixel_satd_4x8_mmx2 x264_template(pixel_satd_4x8_mmx2)
|
||||
#define x264_pixel_satd_4x8_sse2 x264_template(pixel_satd_4x8_sse2)
|
||||
#define x264_pixel_satd_4x8_sse4 x264_template(pixel_satd_4x8_sse4)
|
||||
#define x264_pixel_satd_4x8_ssse3 x264_template(pixel_satd_4x8_ssse3)
|
||||
#define x264_pixel_satd_4x8_ssse3_atom x264_template(pixel_satd_4x8_ssse3_atom)
|
||||
#define x264_pixel_satd_4x8_xop x264_template(pixel_satd_4x8_xop)
|
||||
#define x264_pixel_satd_8x16_avx x264_template(pixel_satd_8x16_avx)
|
||||
#define x264_pixel_satd_8x16_avx2 x264_template(pixel_satd_8x16_avx2)
|
||||
#define x264_pixel_satd_8x16_avx512 x264_template(pixel_satd_8x16_avx512)
|
||||
#define x264_pixel_satd_8x16_mmx2 x264_template(pixel_satd_8x16_mmx2)
|
||||
#define x264_pixel_satd_8x16_sse2 x264_template(pixel_satd_8x16_sse2)
|
||||
#define x264_pixel_satd_8x16_sse4 x264_template(pixel_satd_8x16_sse4)
|
||||
#define x264_pixel_satd_8x16_ssse3 x264_template(pixel_satd_8x16_ssse3)
|
||||
#define x264_pixel_satd_8x16_ssse3_atom x264_template(pixel_satd_8x16_ssse3_atom)
|
||||
#define x264_pixel_satd_8x16_xop x264_template(pixel_satd_8x16_xop)
|
||||
#define x264_pixel_satd_8x4_avx x264_template(pixel_satd_8x4_avx)
|
||||
#define x264_pixel_satd_8x4_avx512 x264_template(pixel_satd_8x4_avx512)
|
||||
#define x264_pixel_satd_8x4_mmx2 x264_template(pixel_satd_8x4_mmx2)
|
||||
#define x264_pixel_satd_8x4_sse2 x264_template(pixel_satd_8x4_sse2)
|
||||
#define x264_pixel_satd_8x4_sse4 x264_template(pixel_satd_8x4_sse4)
|
||||
#define x264_pixel_satd_8x4_ssse3 x264_template(pixel_satd_8x4_ssse3)
|
||||
#define x264_pixel_satd_8x4_ssse3_atom x264_template(pixel_satd_8x4_ssse3_atom)
|
||||
#define x264_pixel_satd_8x4_xop x264_template(pixel_satd_8x4_xop)
|
||||
#define x264_pixel_satd_8x8_avx x264_template(pixel_satd_8x8_avx)
|
||||
#define x264_pixel_satd_8x8_avx2 x264_template(pixel_satd_8x8_avx2)
|
||||
#define x264_pixel_satd_8x8_avx512 x264_template(pixel_satd_8x8_avx512)
|
||||
#define x264_pixel_satd_8x8_mmx2 x264_template(pixel_satd_8x8_mmx2)
|
||||
#define x264_pixel_satd_8x8_sse2 x264_template(pixel_satd_8x8_sse2)
|
||||
#define x264_pixel_satd_8x8_sse4 x264_template(pixel_satd_8x8_sse4)
|
||||
#define x264_pixel_satd_8x8_ssse3 x264_template(pixel_satd_8x8_ssse3)
|
||||
#define x264_pixel_satd_8x8_ssse3_atom x264_template(pixel_satd_8x8_ssse3_atom)
|
||||
#define x264_pixel_satd_8x8_xop x264_template(pixel_satd_8x8_xop)
|
||||
#define x264_pixel_ssd_16x16_avx x264_template(pixel_ssd_16x16_avx)
|
||||
#define x264_pixel_ssd_16x16_avx2 x264_template(pixel_ssd_16x16_avx2)
|
||||
#define x264_pixel_ssd_16x16_mmx x264_template(pixel_ssd_16x16_mmx)
|
||||
#define x264_pixel_ssd_16x16_mmx2 x264_template(pixel_ssd_16x16_mmx2)
|
||||
#define x264_pixel_ssd_16x16_sse2 x264_template(pixel_ssd_16x16_sse2)
|
||||
#define x264_pixel_ssd_16x16_sse2slow x264_template(pixel_ssd_16x16_sse2slow)
|
||||
#define x264_pixel_ssd_16x16_ssse3 x264_template(pixel_ssd_16x16_ssse3)
|
||||
#define x264_pixel_ssd_16x16_xop x264_template(pixel_ssd_16x16_xop)
|
||||
#define x264_pixel_ssd_16x8_avx x264_template(pixel_ssd_16x8_avx)
|
||||
#define x264_pixel_ssd_16x8_avx2 x264_template(pixel_ssd_16x8_avx2)
|
||||
#define x264_pixel_ssd_16x8_mmx x264_template(pixel_ssd_16x8_mmx)
|
||||
#define x264_pixel_ssd_16x8_mmx2 x264_template(pixel_ssd_16x8_mmx2)
|
||||
#define x264_pixel_ssd_16x8_sse2 x264_template(pixel_ssd_16x8_sse2)
|
||||
#define x264_pixel_ssd_16x8_sse2slow x264_template(pixel_ssd_16x8_sse2slow)
|
||||
#define x264_pixel_ssd_16x8_ssse3 x264_template(pixel_ssd_16x8_ssse3)
|
||||
#define x264_pixel_ssd_16x8_xop x264_template(pixel_ssd_16x8_xop)
|
||||
#define x264_pixel_ssd_4x16_mmx x264_template(pixel_ssd_4x16_mmx)
|
||||
#define x264_pixel_ssd_4x16_mmx2 x264_template(pixel_ssd_4x16_mmx2)
|
||||
#define x264_pixel_ssd_4x16_ssse3 x264_template(pixel_ssd_4x16_ssse3)
|
||||
#define x264_pixel_ssd_4x4_mmx x264_template(pixel_ssd_4x4_mmx)
|
||||
#define x264_pixel_ssd_4x4_mmx2 x264_template(pixel_ssd_4x4_mmx2)
|
||||
#define x264_pixel_ssd_4x4_ssse3 x264_template(pixel_ssd_4x4_ssse3)
|
||||
#define x264_pixel_ssd_4x8_mmx x264_template(pixel_ssd_4x8_mmx)
|
||||
#define x264_pixel_ssd_4x8_mmx2 x264_template(pixel_ssd_4x8_mmx2)
|
||||
#define x264_pixel_ssd_4x8_ssse3 x264_template(pixel_ssd_4x8_ssse3)
|
||||
#define x264_pixel_ssd_8x16_avx x264_template(pixel_ssd_8x16_avx)
|
||||
#define x264_pixel_ssd_8x16_mmx x264_template(pixel_ssd_8x16_mmx)
|
||||
#define x264_pixel_ssd_8x16_mmx2 x264_template(pixel_ssd_8x16_mmx2)
|
||||
#define x264_pixel_ssd_8x16_sse2 x264_template(pixel_ssd_8x16_sse2)
|
||||
#define x264_pixel_ssd_8x16_sse2slow x264_template(pixel_ssd_8x16_sse2slow)
|
||||
#define x264_pixel_ssd_8x16_ssse3 x264_template(pixel_ssd_8x16_ssse3)
|
||||
#define x264_pixel_ssd_8x16_xop x264_template(pixel_ssd_8x16_xop)
|
||||
#define x264_pixel_ssd_8x4_avx x264_template(pixel_ssd_8x4_avx)
|
||||
#define x264_pixel_ssd_8x4_mmx x264_template(pixel_ssd_8x4_mmx)
|
||||
#define x264_pixel_ssd_8x4_mmx2 x264_template(pixel_ssd_8x4_mmx2)
|
||||
#define x264_pixel_ssd_8x4_sse2 x264_template(pixel_ssd_8x4_sse2)
|
||||
#define x264_pixel_ssd_8x4_sse2slow x264_template(pixel_ssd_8x4_sse2slow)
|
||||
#define x264_pixel_ssd_8x4_ssse3 x264_template(pixel_ssd_8x4_ssse3)
|
||||
#define x264_pixel_ssd_8x4_xop x264_template(pixel_ssd_8x4_xop)
|
||||
#define x264_pixel_ssd_8x8_avx x264_template(pixel_ssd_8x8_avx)
|
||||
#define x264_pixel_ssd_8x8_mmx x264_template(pixel_ssd_8x8_mmx)
|
||||
#define x264_pixel_ssd_8x8_mmx2 x264_template(pixel_ssd_8x8_mmx2)
|
||||
#define x264_pixel_ssd_8x8_sse2 x264_template(pixel_ssd_8x8_sse2)
|
||||
#define x264_pixel_ssd_8x8_sse2slow x264_template(pixel_ssd_8x8_sse2slow)
|
||||
#define x264_pixel_ssd_8x8_ssse3 x264_template(pixel_ssd_8x8_ssse3)
|
||||
#define x264_pixel_ssd_8x8_xop x264_template(pixel_ssd_8x8_xop)
|
||||
#define x264_pixel_var_16x16_avx x264_template(pixel_var_16x16_avx)
|
||||
#define x264_pixel_var_16x16_avx2 x264_template(pixel_var_16x16_avx2)
|
||||
#define x264_pixel_var_16x16_avx512 x264_template(pixel_var_16x16_avx512)
|
||||
#define x264_pixel_var_16x16_sse2 x264_template(pixel_var_16x16_sse2)
|
||||
#define x264_pixel_var_8x16_avx x264_template(pixel_var_8x16_avx)
|
||||
#define x264_pixel_var_8x16_avx512 x264_template(pixel_var_8x16_avx512)
|
||||
#define x264_pixel_var_8x16_sse2 x264_template(pixel_var_8x16_sse2)
|
||||
#define x264_pixel_var_8x8_avx x264_template(pixel_var_8x8_avx)
|
||||
#define x264_pixel_var_8x8_avx512 x264_template(pixel_var_8x8_avx512)
|
||||
#define x264_pixel_var_8x8_sse2 x264_template(pixel_var_8x8_sse2)
|
||||
/* Declare one family of pixel-metric prototypes for a single CPU suffix.
 *
 * Expands to eight declarations, one per block size (16x16, 16x8, 8x16, 8x8,
 * 8x4, 4x16, 4x8, 4x4), each of the form
 *     ret x264_pixel_<name>_<WxH>_<suffix> args;
 *
 *   ret    - return type shared by all eight declarations
 *   name   - metric name pasted into the identifier
 *   suffix - instruction-set suffix pasted into the identifier
 *   args   - complete, parenthesized parameter list
 *
 * Every expansion line already ends in ';', so invocations take no trailing
 * semicolon.  The final line has no line continuation, so the definition ends
 * here regardless of what the next line of the file contains. */
#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x16_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;
|
||||
|
||||
/* Declare all eight block sizes of a metric that returns int and takes
 * ( pixel *, intptr_t, pixel *, intptr_t ).
 * NOTE(review): the parameters are unnamed here; presumably
 * (pix1, stride1, pix2, stride2) -- confirm against the definitions.
 * Forwards to DECL_PIXELS, so invocations take no trailing semicolon. */
#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
|
||||
|
||||
/* Declare the multi-candidate variants of a metric.  Despite the name, this
 * emits BOTH families for every block size:
 *   void x264_pixel_<name>_x3_<WxH>_<suffix>( pixel *, pixel *, pixel *, pixel *, intptr_t, int * )
 *   void x264_pixel_<name>_x4_<WxH>_<suffix>( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * )
 * i.e. the _x4 form takes one more pixel pointer than the _x3 form.
 * NOTE(review): presumably one source block, 3 or 4 candidate blocks, a shared
 * stride and an output score array -- confirm against the definitions.
 * Forwards to DECL_PIXELS, so invocations take no trailing semicolon. */
#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
|
||||
|
||||
DECL_X1( sad, mmx2 )
|
||||
DECL_X1( sad, sse2 )
|
||||
DECL_X1( sad, sse3 )
|
||||
DECL_X1( sad, sse2_aligned )
|
||||
DECL_X1( sad, ssse3 )
|
||||
DECL_X1( sad, ssse3_aligned )
|
||||
DECL_X1( sad, avx2 )
|
||||
DECL_X1( sad, avx512 )
|
||||
DECL_X4( sad, mmx2 )
|
||||
DECL_X4( sad, sse2 )
|
||||
DECL_X4( sad, sse3 )
|
||||
DECL_X4( sad, ssse3 )
|
||||
DECL_X4( sad, xop )
|
||||
DECL_X4( sad, avx )
|
||||
DECL_X4( sad, avx2 )
|
||||
DECL_X4( sad, avx512 )
|
||||
DECL_X1( ssd, mmx )
|
||||
DECL_X1( ssd, mmx2 )
|
||||
DECL_X1( ssd, sse2slow )
|
||||
DECL_X1( ssd, sse2 )
|
||||
DECL_X1( ssd, ssse3 )
|
||||
DECL_X1( ssd, avx )
|
||||
DECL_X1( ssd, xop )
|
||||
DECL_X1( ssd, avx2 )
|
||||
DECL_X1( satd, mmx2 )
|
||||
DECL_X1( satd, sse2 )
|
||||
DECL_X1( satd, ssse3 )
|
||||
DECL_X1( satd, ssse3_atom )
|
||||
DECL_X1( satd, sse4 )
|
||||
DECL_X1( satd, avx )
|
||||
DECL_X1( satd, xop )
|
||||
DECL_X1( satd, avx2 )
|
||||
DECL_X1( satd, avx512 )
|
||||
DECL_X1( sa8d, mmx2 )
|
||||
DECL_X1( sa8d, sse2 )
|
||||
DECL_X1( sa8d, ssse3 )
|
||||
DECL_X1( sa8d, ssse3_atom )
|
||||
DECL_X1( sa8d, sse4 )
|
||||
DECL_X1( sa8d, avx )
|
||||
DECL_X1( sa8d, xop )
|
||||
DECL_X1( sa8d, avx2 )
|
||||
DECL_X1( sa8d, avx512 )
|
||||
DECL_X1( sad, cache32_mmx2 );
|
||||
DECL_X1( sad, cache64_mmx2 );
|
||||
DECL_X1( sad, cache64_sse2 );
|
||||
DECL_X1( sad, cache64_ssse3 );
|
||||
DECL_X4( sad, cache32_mmx2 );
|
||||
DECL_X4( sad, cache64_mmx2 );
|
||||
DECL_X4( sad, cache64_sse2 );
|
||||
DECL_X4( sad, cache64_ssse3 );
|
||||
|
||||
/* Single-input metrics returning uint64_t: each line declares
 * uint64_t x264_pixel_<metric>_<WxH>_<suffix>( pixel *pix, intptr_t i_stride )
 * for all eight block sizes. */

/* var */
DECL_PIXELS( uint64_t, var, sse2,   ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx,    ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx2,   ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride ))

/* hadamard_ac */
DECL_PIXELS( uint64_t, hadamard_ac, mmx2,       ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2,       ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3,      ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4,       ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx,        ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, xop,        ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx2,       ( pixel *pix, intptr_t i_stride ))
|
||||
|
||||
|
||||
#define x264_intra_satd_x3_4x4_mmx2 x264_template(intra_satd_x3_4x4_mmx2)
|
||||
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
|
||||
#define x264_intra_sad_x3_4x4_mmx2 x264_template(intra_sad_x3_4x4_mmx2)
|
||||
void x264_intra_sad_x3_4x4_mmx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_4x4_sse2 x264_template(intra_sad_x3_4x4_sse2)
|
||||
void x264_intra_sad_x3_4x4_sse2 ( uint16_t*, uint16_t*, int * );
|
||||
#define x264_intra_sad_x3_4x4_ssse3 x264_template(intra_sad_x3_4x4_ssse3)
|
||||
void x264_intra_sad_x3_4x4_ssse3 ( uint16_t*, uint16_t*, int * );
|
||||
#define x264_intra_sad_x3_4x4_avx x264_template(intra_sad_x3_4x4_avx)
|
||||
void x264_intra_sad_x3_4x4_avx ( uint16_t*, uint16_t*, int * );
|
||||
#define x264_intra_satd_x3_8x8c_mmx2 x264_template(intra_satd_x3_8x8c_mmx2)
|
||||
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
|
||||
#define x264_intra_satd_x3_8x8c_ssse3 x264_template(intra_satd_x3_8x8c_ssse3)
|
||||
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_8x8c_mmx2 x264_template(intra_sad_x3_8x8c_mmx2)
|
||||
void x264_intra_sad_x3_8x8c_mmx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_8x8c_ssse3 x264_template(intra_sad_x3_8x8c_ssse3)
|
||||
void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_8x8c_avx2 x264_template(intra_sad_x3_8x8c_avx2)
|
||||
void x264_intra_sad_x3_8x8c_avx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_satd_x3_16x16_mmx2 x264_template(intra_satd_x3_16x16_mmx2)
|
||||
void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * );
|
||||
#define x264_intra_satd_x3_16x16_ssse3 x264_template(intra_satd_x3_16x16_ssse3)
|
||||
void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_16x16_mmx2 x264_template(intra_sad_x3_16x16_mmx2)
|
||||
void x264_intra_sad_x3_16x16_mmx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_16x16_sse2 x264_template(intra_sad_x3_16x16_sse2)
|
||||
void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_16x16_ssse3 x264_template(intra_sad_x3_16x16_ssse3)
|
||||
void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_16x16_avx2 x264_template(intra_sad_x3_16x16_avx2)
|
||||
void x264_intra_sad_x3_16x16_avx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sa8d_x3_8x8_mmx2 x264_template(intra_sa8d_x3_8x8_mmx2)
|
||||
void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sa8d_x3_8x8_sse2 x264_template(intra_sa8d_x3_8x8_sse2)
|
||||
void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_8x8_mmx2 x264_template(intra_sad_x3_8x8_mmx2)
|
||||
void x264_intra_sad_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
|
||||
#define x264_intra_sad_x3_8x8_sse2 x264_template(intra_sad_x3_8x8_sse2)
|
||||
void x264_intra_sad_x3_8x8_sse2 ( uint16_t*, uint16_t*, int * );
|
||||
#define x264_intra_sad_x3_8x8_ssse3 x264_template(intra_sad_x3_8x8_ssse3)
|
||||
void x264_intra_sad_x3_8x8_ssse3 ( uint16_t*, uint16_t*, int * );
|
||||
#define x264_intra_sad_x3_8x8_avx2 x264_template(intra_sad_x3_8x8_avx2)
|
||||
void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * );
|
||||
#define x264_intra_satd_x9_4x4_ssse3 x264_template(intra_satd_x9_4x4_ssse3)
|
||||
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_satd_x9_4x4_sse4 x264_template(intra_satd_x9_4x4_sse4)
|
||||
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_satd_x9_4x4_avx x264_template(intra_satd_x9_4x4_avx)
|
||||
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_satd_x9_4x4_xop x264_template(intra_satd_x9_4x4_xop)
|
||||
int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_4x4_ssse3 x264_template(intra_sad_x9_4x4_ssse3)
|
||||
int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_4x4_sse4 x264_template(intra_sad_x9_4x4_sse4)
|
||||
int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_4x4_avx x264_template(intra_sad_x9_4x4_avx)
|
||||
int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
|
||||
#define x264_intra_sa8d_x9_8x8_ssse3 x264_template(intra_sa8d_x9_8x8_ssse3)
|
||||
int x264_intra_sa8d_x9_8x8_ssse3( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
#define x264_intra_sa8d_x9_8x8_sse4 x264_template(intra_sa8d_x9_8x8_sse4)
|
||||
int x264_intra_sa8d_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
#define x264_intra_sa8d_x9_8x8_avx x264_template(intra_sa8d_x9_8x8_avx)
|
||||
int x264_intra_sa8d_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_8x8_ssse3 x264_template(intra_sad_x9_8x8_ssse3)
|
||||
int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_8x8_sse4 x264_template(intra_sad_x9_8x8_sse4)
|
||||
int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_8x8_avx x264_template(intra_sad_x9_8x8_avx)
|
||||
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
#define x264_intra_sad_x9_8x8_avx2 x264_template(intra_sad_x9_8x8_avx2)
|
||||
int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
|
||||
|
||||
#define x264_pixel_ssd_nv12_core_sse2 x264_template(pixel_ssd_nv12_core_sse2)
|
||||
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
|
||||
pixel *pixuv2, intptr_t stride2, int width,
|
||||
int height, uint64_t *ssd_u, uint64_t *ssd_v );
|
||||
#define x264_pixel_ssd_nv12_core_avx x264_template(pixel_ssd_nv12_core_avx)
|
||||
void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
|
||||
pixel *pixuv2, intptr_t stride2, int width,
|
||||
int height, uint64_t *ssd_u, uint64_t *ssd_v );
|
||||
#define x264_pixel_ssd_nv12_core_xop x264_template(pixel_ssd_nv12_core_xop)
|
||||
void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
|
||||
pixel *pixuv2, intptr_t stride2, int width,
|
||||
int height, uint64_t *ssd_u, uint64_t *ssd_v );
|
||||
#define x264_pixel_ssd_nv12_core_avx2 x264_template(pixel_ssd_nv12_core_avx2)
|
||||
void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
|
||||
pixel *pixuv2, intptr_t stride2, int width,
|
||||
int height, uint64_t *ssd_u, uint64_t *ssd_v );
|
||||
#define x264_pixel_ssim_4x4x2_core_mmx2 x264_template(pixel_ssim_4x4x2_core_mmx2)
|
||||
void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1,
|
||||
const uint8_t *pix2, intptr_t stride2, int sums[2][4] );
|
||||
#define x264_pixel_ssim_4x4x2_core_sse2 x264_template(pixel_ssim_4x4x2_core_sse2)
|
||||
void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1,
|
||||
const pixel *pix2, intptr_t stride2, int sums[2][4] );
|
||||
#define x264_pixel_ssim_4x4x2_core_avx x264_template(pixel_ssim_4x4x2_core_avx)
|
||||
void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
|
||||
const pixel *pix2, intptr_t stride2, int sums[2][4] );
|
||||
#define x264_pixel_ssim_end4_sse2 x264_template(pixel_ssim_end4_sse2)
|
||||
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
|
||||
#define x264_pixel_ssim_end4_avx x264_template(pixel_ssim_end4_avx)
|
||||
float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
|
||||
#define x264_pixel_var2_8x8_sse2 x264_template(pixel_var2_8x8_sse2)
|
||||
int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x8_ssse3 x264_template(pixel_var2_8x8_ssse3)
|
||||
int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x8_avx2 x264_template(pixel_var2_8x8_avx2)
|
||||
int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x8_avx512 x264_template(pixel_var2_8x8_avx512)
|
||||
int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x16_sse2 x264_template(pixel_var2_8x16_sse2)
|
||||
int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x16_ssse3 x264_template(pixel_var2_8x16_ssse3)
|
||||
int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x16_avx2 x264_template(pixel_var2_8x16_avx2)
|
||||
int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
|
||||
#define x264_pixel_var2_8x16_avx512 x264_template(pixel_var2_8x16_avx512)
|
||||
int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
|
||||
#define x264_pixel_vsad_mmx2 x264_template(pixel_vsad_mmx2)
|
||||
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
|
||||
#define x264_pixel_vsad_sse2 x264_template(pixel_vsad_sse2)
|
||||
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
|
||||
#define x264_pixel_vsad_ssse3 x264_template(pixel_vsad_ssse3)
|
||||
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
|
||||
#define x264_pixel_vsad_xop x264_template(pixel_vsad_xop)
|
||||
int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height );
|
||||
#define x264_pixel_vsad_avx2 x264_template(pixel_vsad_avx2)
|
||||
int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height );
|
||||
#define x264_pixel_asd8_sse2 x264_template(pixel_asd8_sse2)
|
||||
int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
|
||||
#define x264_pixel_asd8_ssse3 x264_template(pixel_asd8_ssse3)
|
||||
int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
|
||||
#define x264_pixel_asd8_xop x264_template(pixel_asd8_xop)
|
||||
int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
|
||||
#define x264_pixel_sa8d_satd_16x16_sse2 x264_template(pixel_sa8d_satd_16x16_sse2)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
#define x264_pixel_sa8d_satd_16x16_ssse3 x264_template(pixel_sa8d_satd_16x16_ssse3)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
#define x264_pixel_sa8d_satd_16x16_ssse3_atom x264_template(pixel_sa8d_satd_16x16_ssse3_atom)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
#define x264_pixel_sa8d_satd_16x16_sse4 x264_template(pixel_sa8d_satd_16x16_sse4)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
#define x264_pixel_sa8d_satd_16x16_avx x264_template(pixel_sa8d_satd_16x16_avx)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
#define x264_pixel_sa8d_satd_16x16_xop x264_template(pixel_sa8d_satd_16x16_xop)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
#define x264_pixel_sa8d_satd_16x16_avx2 x264_template(pixel_sa8d_satd_16x16_avx2)
|
||||
uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
|
||||
|
||||
|
||||
/* Declare int x264_pixel_ads<size>_<suffix>( ... ).
 *
 *   size   - pasted into the function name and used as the declared length of
 *            the enc_dc array parameter
 *   suffix - instruction-set suffix pasted into the function name
 *
 * The expansion already ends in ';', so invocations take no trailing
 * semicolon. */
#define DECL_ADS( size, suffix ) \
    int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
                                         uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
|
||||
/* x264_pixel_ads{4,2,1}_<suffix> prototypes, one group per CPU suffix.
 * DECL_ADS supplies the terminating ';'. */
DECL_ADS( 4, mmx2 )
DECL_ADS( 2, mmx2 )
DECL_ADS( 1, mmx2 )

DECL_ADS( 4, sse2 )
DECL_ADS( 2, sse2 )
DECL_ADS( 1, sse2 )

DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )

DECL_ADS( 4, avx )
DECL_ADS( 2, avx )
DECL_ADS( 1, avx )

DECL_ADS( 4, avx2 )
DECL_ADS( 2, avx2 )
DECL_ADS( 1, avx2 )
|
||||
|
||||
/* The declaration helpers are only needed to build the prototypes in this
 * header; remove them so they are not visible to files that include it. */
#undef DECL_PIXELS
#undef DECL_X1
#undef DECL_X4
#undef DECL_ADS
|
||||
|
||||
#endif
|
||||
2181
common/x86/predict-a.asm
Normal file
2181
common/x86/predict-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
620
common/x86/predict-c.c
Normal file
620
common/x86/predict-c.c
Normal file
@@ -0,0 +1,620 @@
|
||||
/*****************************************************************************
|
||||
* predict-c.c: intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2025 x264 project
|
||||
*
|
||||
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||
* Loren Merritt <lorenm@u.washington.edu>
|
||||
* Fiona Glaser <fiona@x264.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "predict.h"
|
||||
#include "pixel.h"
|
||||
|
||||
/* Adds the i-th weighted gradient term to the running sums H and V.  The
 * expansion site must already have `H`, `V` and `src` in scope.
 *   H: difference of the two pixels i columns either side of column j, taken
 *      from the row directly above the block (offset -FDEC_STRIDE).
 *   V: difference of the two pixels i rows either side of row j, taken from
 *      the column directly left of the block (offset -1).
 * Both statements end with ';', so call sites add no separator. */
#define PREDICT_P_SUM(j,i)\
    H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
    V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
|
||||
|
||||
#if HAVE_X86_INLINE_ASM
/* Per-lane weights passed as memory operands to the PREDICT_*_P_ASM inline
 * assembly in this file.  16-bit entries (pw_*) serve high-bit-depth builds,
 * 8-bit entries (pb_*) serve 8-bit builds.
 *   *_12345678  : +1..+8, applied to the right half of the row above (16x16)
 *   *_m87654321 : -8..-1, applied to the left half of the row above (16x16)
 *   *_m32101234 : -3..+4, applied to the 8 pixels above an 8x8 chroma block */
#if HIGH_BIT_DEPTH
ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#else // !HIGH_BIT_DEPTH
ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#endif // HIGH_BIT_DEPTH
#endif // HAVE_X86_INLINE_ASM
|
||||
|
||||
/* Plain-C gradient sums for the 16x16 plane predictor.
 * Declares H and V in the enclosing scope and accumulates the eight weighted
 * differences (weights 1..8) centred on column/row 7, reading the row above
 * and the column left of `src` via PREDICT_P_SUM. */
#define PREDICT_16x16_P_CORE\
    int H = 0;\
    int V = 0;\
    PREDICT_P_SUM(7,1)\
    PREDICT_P_SUM(7,2)\
    PREDICT_P_SUM(7,3)\
    PREDICT_P_SUM(7,4)\
    PREDICT_P_SUM(7,5)\
    PREDICT_P_SUM(7,6)\
    PREDICT_P_SUM(7,7)\
    PREDICT_P_SUM(7,8)
|
||||
|
||||
/* Second half of every 16x16 plane wrapper: turns the gradient sums H and V
 * (already in scope) into the plane parameters and hands them to the asm
 * core x264_predict_16x16_p_core_<name>.
 *   a   : 16 * (bottom-left neighbour + top-right neighbour)
 *   b,c : horizontal / vertical slope, (5*sum + 32) >> 6
 *   i00 : a - 7*b - 7*c + 16, passed to the core as its second argument
 * When BIT_DEPTH > 8 and i00 exceeds 0x7fff or either slope exceeds 1092 in
 * magnitude, the C implementation x264_predict_16x16_p_c is used instead of
 * the asm core. */
#define PREDICT_16x16_P_END(name)\
    int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
    int b = ( 5 * H + 32 ) >> 6;\
    int c = ( 5 * V + 32 ) >> 6;\
    int i00 = a - b * 7 - c * 7 + 16;\
    /* b*15 + c*15 can overflow: it's easier to just branch away in this rare case\
     * than to try to consider it in the asm. */\
    if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\
        x264_predict_16x16_p_c( src );\
    else\
        x264_predict_16x16_p_core_##name( src, i00, b, c );
|
||||
|
||||
/* Defines static void predict_16x16_p_<name>( pixel *src ): computes the
 * gradient sums in plain C (PREDICT_16x16_P_CORE) and finishes through
 * PREDICT_16x16_P_END, which dispatches to the asm core suffixed <name2>.
 * The two suffixes are separate so a wrapper can reuse another variant's core. */
#define PREDICT_16x16_P(name, name2)\
static void predict_16x16_p_##name( pixel *src )\
{\
    PREDICT_16x16_P_CORE\
    PREDICT_16x16_P_END(name2)\
}
|
||||
|
||||
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
/* Inline-asm computation of H for 16-bit pixels.
 *   %1 = 8 pixels starting at src[-FDEC_STRIDE-1] (loaded with movdqu)
 *   %2 = 8 pixels starting at src[-FDEC_STRIDE+8] (loaded with movdqa, so
 *        this address must be 16-byte aligned)
 * pmaddwd weights %2 by pw_12345678 and %1 by pw_m87654321, producing 32-bit
 * partial sums; the paddd/movhlps/pshuflw sequence folds them into the low
 * dword, which movd stores to H. */
#define PREDICT_16x16_P_ASM\
    asm (\
        "movdqu %1, %%xmm1 \n"\
        "movdqa %2, %%xmm0 \n"\
        "pmaddwd %3, %%xmm0 \n"\
        "pmaddwd %4, %%xmm1 \n"\
        "paddd %%xmm1, %%xmm0 \n"\
        "movhlps %%xmm0, %%xmm1 \n"\
        "paddd %%xmm1, %%xmm0 \n"\
        "pshuflw $14, %%xmm0, %%xmm1 \n"\
        "paddd %%xmm1, %%xmm0 \n"\
        "movd %%xmm0, %0 \n"\
        :"=r"(H)\
        :"m"(MEM_FIX(&src[-FDEC_STRIDE-1], const pixel, 8)),\
         "m"(MEM_FIX(&src[-FDEC_STRIDE+8], const pixel, 8)),\
         "m"(MEM_FIX(pw_12345678, const int16_t, 8)),\
         "m"(MEM_FIX(pw_m87654321, const int16_t, 8))\
        :"xmm0", "xmm1"\
    );
#else // !HIGH_BIT_DEPTH
/* Inline-asm computation of H for 8-bit pixels, using MMX registers.
 *   %1 = src[-FDEC_STRIDE .. +7], %2 = src[-FDEC_STRIDE+8 .. +15],
 *   %3 = src[-FDEC_STRIDE-8 .. -1]
 * palignr $7 shifts the last byte of %3 in front of %1, so mm1 ends up with
 * the 8 pixels starting at src[-FDEC_STRIDE-1].  pmaddubsw multiplies the
 * unsigned pixels by the signed byte weights into 16-bit pair sums, which
 * paddw/pshufw fold into the low word.  The final movswl sign-extends that
 * 16-bit result to the full int H. */
#define PREDICT_16x16_P_ASM\
    asm (\
        "movq %1, %%mm1 \n"\
        "movq %2, %%mm0 \n"\
        "palignr $7, %3, %%mm1 \n"\
        "pmaddubsw %4, %%mm0 \n"\
        "pmaddubsw %5, %%mm1 \n"\
        "paddw %%mm1, %%mm0 \n"\
        "pshufw $14, %%mm0, %%mm1 \n"\
        "paddw %%mm1, %%mm0 \n"\
        "pshufw $1, %%mm0, %%mm1 \n"\
        "paddw %%mm1, %%mm0 \n"\
        "movd %%mm0, %0 \n"\
        "movswl %w0, %0 \n"\
        :"=r"(H)\
        :"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\
         "m"(MEM_FIX(&src[-FDEC_STRIDE+8], const pixel, 8)),\
         "m"(MEM_FIX(&src[-FDEC_STRIDE-8], const pixel, 8)),\
         "m"(MEM_FIX(pb_12345678, const int8_t, 8)),\
         "m"(MEM_FIX(pb_m87654321, const int8_t, 8))\
        :"mm0", "mm1"\
    );
#endif // HIGH_BIT_DEPTH

/* Inline-asm counterpart of PREDICT_16x16_P_CORE: H comes from
 * PREDICT_16x16_P_ASM, V is the same weighted sum over the left column as in
 * the C version, written out term by term (weights 8 down to 1). */
#define PREDICT_16x16_P_CORE_INLINE\
    int H, V;\
    PREDICT_16x16_P_ASM\
    V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\
      + 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\
      + 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\
      + 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\
      + 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\
      + 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\
      + 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\
      + 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );

/* Same contract as PREDICT_16x16_P(name, name2), but H is computed with
 * inline asm. */
#define PREDICT_16x16_P_INLINE(name, name2)\
static void predict_16x16_p_##name( pixel *src )\
{\
    PREDICT_16x16_P_CORE_INLINE\
    PREDICT_16x16_P_END(name2)\
}
#else // !HAVE_X86_INLINE_ASM
/* No inline asm available: the "inline" wrapper is just the plain-C one. */
#define PREDICT_16x16_P_INLINE(name, name2) PREDICT_16x16_P(name, name2)
#endif // HAVE_X86_INLINE_ASM
|
||||
|
||||
#if HIGH_BIT_DEPTH
|
||||
PREDICT_16x16_P_INLINE( sse2, sse2 )
|
||||
#else // !HIGH_BIT_DEPTH
|
||||
#if !ARCH_X86_64
|
||||
PREDICT_16x16_P( mmx2, mmx2 )
|
||||
#endif // !ARCH_X86_64
|
||||
PREDICT_16x16_P( sse2, sse2 )
|
||||
#if HAVE_X86_INLINE_ASM
|
||||
PREDICT_16x16_P_INLINE( ssse3, sse2 )
|
||||
#endif // HAVE_X86_INLINE_ASM
|
||||
PREDICT_16x16_P_INLINE( avx, avx )
|
||||
#endif // HIGH_BIT_DEPTH
|
||||
PREDICT_16x16_P_INLINE( avx2, avx2 )
|
||||
|
||||
/* Gradient sums for the 8x16 chroma plane predictor.
 * Declares H and V in the enclosing scope (`src` must be in scope).
 *   H: weights 1..4 over the row above, pairing columns 4+i and 2-i
 *      (the last pair reaches column -1, the top-left neighbour).
 *   V: weights 1..8 over the left column, pairing rows i+8 and 6-i
 *      (the last pair reaches row -1). */
#define PREDICT_8x16C_P_CORE\
    int H = 0, V = 0;\
    for( int i = 0; i < 4; i++ )\
        H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\
    for( int i = 0; i < 8; i++ )\
        V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
|
||||
|
||||
/* Second half of the 8x16 chroma plane wrappers: derives a, b, c from the
 * sums H and V already in scope and calls x264_predict_8x16c_p_core_<name>.
 * The two builds pass a different second argument to the core:
 *   high bit depth : the raw `a`
 *   8-bit          : i00 = a - 3*b - 7*c + 16 */
#if HIGH_BIT_DEPTH
#define PREDICT_8x16C_P_END(name)\
    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
    int b = ( 17 * H + 16 ) >> 5;\
    int c = ( 5 * V + 32 ) >> 6;\
    x264_predict_8x16c_p_core_##name( src, a, b, c );
#else // !HIGH_BIT_DEPTH
#define PREDICT_8x16C_P_END(name)\
    int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
    int b = ( 17 * H + 16 ) >> 5;\
    int c = ( 5 * V + 32 ) >> 6;\
    int i00 = a -3*b -7*c + 16;\
    x264_predict_8x16c_p_core_##name( src, i00, b, c );
#endif // HIGH_BIT_DEPTH
|
||||
|
||||
#define PREDICT_8x16C_P(name)\
|
||||
static void predict_8x16c_p_##name( pixel *src )\
|
||||
{\
|
||||
PREDICT_8x16C_P_CORE\
|
||||
PREDICT_8x16C_P_END(name)\
|
||||
}
|
||||
|
||||
#if !ARCH_X86_64 && !HIGH_BIT_DEPTH
|
||||
PREDICT_8x16C_P( mmx2 )
|
||||
#endif // !ARCH_X86_64 && !HIGH_BIT_DEPTH
|
||||
PREDICT_8x16C_P( sse2 )
|
||||
PREDICT_8x16C_P( avx )
|
||||
PREDICT_8x16C_P( avx2 )
|
||||
|
||||
/* Plain-C gradient sums for the 8x8 chroma plane predictor.
 * Declares H and V in the enclosing scope and accumulates four weighted
 * differences (weights 1..4) centred on column/row 3 via PREDICT_P_SUM. */
#define PREDICT_8x8C_P_CORE\
    int H = 0;\
    int V = 0;\
    PREDICT_P_SUM(3,1)\
    PREDICT_P_SUM(3,2)\
    PREDICT_P_SUM(3,3)\
    PREDICT_P_SUM(3,4)
|
||||
|
||||
/* Second half of the 8x8 chroma plane wrappers: derives a, b, c from the
 * sums H and V already in scope and calls x264_predict_8x8c_p_core_<name>.
 * The two builds pass a different second argument to the core:
 *   high bit depth : the raw `a`
 *   8-bit          : i00 = a - 3*b - 3*c + 16 */
#if HIGH_BIT_DEPTH
#define PREDICT_8x8C_P_END(name)\
    int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
    int b = ( 17 * H + 16 ) >> 5;\
    int c = ( 17 * V + 16 ) >> 5;\
    x264_predict_8x8c_p_core_##name( src, a, b, c );
#else // !HIGH_BIT_DEPTH
#define PREDICT_8x8C_P_END(name)\
    int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
    int b = ( 17 * H + 16 ) >> 5;\
    int c = ( 17 * V + 16 ) >> 5;\
    int i00 = a -3*b -3*c + 16;\
    x264_predict_8x8c_p_core_##name( src, i00, b, c );
#endif // HIGH_BIT_DEPTH
|
||||
|
||||
/* Defines static void predict_8x8c_p_<name>( pixel *src ): plain-C gradient
 * sums (PREDICT_8x8C_P_CORE) followed by PREDICT_8x8C_P_END, which calls the
 * asm core suffixed <name2>. */
#define PREDICT_8x8C_P(name, name2)\
static void predict_8x8c_p_##name( pixel *src )\
{\
    PREDICT_8x8C_P_CORE\
    PREDICT_8x8C_P_END(name2)\
}
|
||||
|
||||
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
/* Inline-asm partial H for 16-bit pixels: loads the 8 pixels starting at
 * src[-FDEC_STRIDE] with movdqa (so that address must be 16-byte aligned),
 * weights them by pw_m32101234 with pmaddwd and folds the 32-bit partial
 * sums into H.  The -4 * top-left term is not covered here; it is added by
 * PREDICT_8x8C_P_CORE_INLINE. */
#define PREDICT_8x8C_P_ASM\
    asm (\
        "movdqa %1, %%xmm0 \n"\
        "pmaddwd %2, %%xmm0 \n"\
        "movhlps %%xmm0, %%xmm1 \n"\
        "paddd %%xmm1, %%xmm0 \n"\
        "pshuflw $14, %%xmm0, %%xmm1 \n"\
        "paddd %%xmm1, %%xmm0 \n"\
        "movd %%xmm0, %0 \n"\
        :"=r"(H)\
        :"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\
         "m"(MEM_FIX(pw_m32101234, const int16_t, 8))\
        :"xmm0", "xmm1"\
    );
#else // !HIGH_BIT_DEPTH
/* Inline-asm partial H for 8-bit pixels, using MMX registers: loads the 8
 * pixels starting at src[-FDEC_STRIDE], weights them by pb_m32101234 with
 * pmaddubsw (unsigned pixel * signed weight, 16-bit pair sums), folds the
 * words together and sign-extends the 16-bit result into H with movswl.
 * The -4 * top-left term is added by PREDICT_8x8C_P_CORE_INLINE. */
#define PREDICT_8x8C_P_ASM\
    asm (\
        "movq %1, %%mm0 \n"\
        "pmaddubsw %2, %%mm0 \n"\
        "pshufw $14, %%mm0, %%mm1 \n"\
        "paddw %%mm1, %%mm0 \n"\
        "pshufw $1, %%mm0, %%mm1 \n"\
        "paddw %%mm1, %%mm0 \n"\
        "movd %%mm0, %0 \n"\
        "movswl %w0, %0 \n"\
        :"=r"(H)\
        :"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\
         "m"(MEM_FIX(pb_m32101234, const int8_t, 8))\
        :"mm0", "mm1"\
    );
#endif // HIGH_BIT_DEPTH

/* Inline-asm counterpart of PREDICT_8x8C_P_CORE.  V is the weighted sum over
 * the left column written out term by term; H starts from the asm result and
 * is completed with the top-left neighbour, which the 8-pixel asm load does
 * not include. */
#define PREDICT_8x8C_P_CORE_INLINE\
    int H, V;\
    PREDICT_8x8C_P_ASM\
    V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
      + 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
      + 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
      + 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
    H += -4 * src[-1*FDEC_STRIDE -1];

/* Same contract as PREDICT_8x8C_P(name, name2), but H is computed with
 * inline asm. */
#define PREDICT_8x8C_P_INLINE(name, name2)\
static void predict_8x8c_p_##name( pixel *src )\
{\
    PREDICT_8x8C_P_CORE_INLINE\
    PREDICT_8x8C_P_END(name2)\
}
#else // !HAVE_X86_INLINE_ASM
/* No inline asm available: the "inline" wrapper is just the plain-C one. */
#define PREDICT_8x8C_P_INLINE(name, name2) PREDICT_8x8C_P(name, name2)
#endif // HAVE_X86_INLINE_ASM
|
||||
|
||||
#if HIGH_BIT_DEPTH
|
||||
PREDICT_8x8C_P_INLINE( sse2, sse2 )
|
||||
#else //!HIGH_BIT_DEPTH
|
||||
#if !ARCH_X86_64
|
||||
PREDICT_8x8C_P( mmx2, mmx2 )
|
||||
#endif // !ARCH_X86_64
|
||||
PREDICT_8x8C_P( sse2, sse2 )
|
||||
#if HAVE_X86_INLINE_ASM
|
||||
PREDICT_8x8C_P_INLINE( ssse3, sse2 )
|
||||
#endif // HAVE_X86_INLINE_ASM
|
||||
#endif // HIGH_BIT_DEPTH
|
||||
PREDICT_8x8C_P_INLINE( avx, avx )
|
||||
PREDICT_8x8C_P_INLINE( avx2, avx2 )
|
||||
|
||||
#if ARCH_X86_64 && !HIGH_BIT_DEPTH
/* DC prediction of an 8x8 chroma block from its left neighbours only.
 * The upper four rows are filled with the rounded mean of left pixels 0..3,
 * the lower four rows with the rounded mean of left pixels 4..7.
 * src points at the top-left pixel of the block; rows are FDEC_STRIDE bytes
 * apart and each row is written as one 64-bit store through M64(). */
static void predict_8x8c_dc_left( uint8_t *src )
{
    uint32_t s0 = 0, s1 = 0;

    /* Sum the left-neighbour column separately for each half. */
    for( int y = 0; y < 4; y++ )
    {
        s0 += src[y * FDEC_STRIDE - 1];
        s1 += src[(y+4) * FDEC_STRIDE - 1];
    }

    /* Rounded average of four pixels, replicated into all eight bytes. */
    uint64_t dc0 = (( s0 + 2 ) >> 2) * 0x0101010101010101ULL;
    uint64_t dc1 = (( s1 + 2 ) >> 2) * 0x0101010101010101ULL;

    for( int y = 0; y < 4; y++ )
    {
        M64( src ) = dc0;
        src += FDEC_STRIDE;
    }
    for( int y = 0; y < 4; y++ )
    {
        M64( src ) = dc1;
        src += FDEC_STRIDE;
    }
}
#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH
|
||||
|
||||
/****************************************************************************
|
||||
* Exported functions:
|
||||
****************************************************************************/
|
||||
/* Fills the 16x16 intra-prediction function table with x86 implementations.
 *
 * cpu : bitmask of X264_CPU_* capability flags
 * pf  : table indexed by I_PRED_16x16_*; only the entries for which an x86
 *       version exists are overwritten, the rest are left as the caller set
 *       them.
 *
 * The body is a capability ladder: each stage overwrites entries set by an
 * earlier one, and the function returns at the first capability the CPU
 * lacks.  Statement order therefore decides which implementation wins. */
void x264_predict_16x16_init_mmx( uint32_t cpu, x264_predict_t pf[7] )
{
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_16x16_V] = x264_predict_16x16_v_mmx2;
    pf[I_PRED_16x16_H] = x264_predict_16x16_h_mmx2;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSE) )
        return;
    pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
    pf[I_PRED_16x16_H] = x264_predict_16x16_h_sse2;
    pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_16x16_V] = x264_predict_16x16_v_avx;
    if( !(cpu&X264_CPU_AVX2) )
        return;
    pf[I_PRED_16x16_H] = x264_predict_16x16_h_avx2;
#else
#if !ARCH_X86_64
    pf[I_PRED_16x16_P] = predict_16x16_p_mmx2;
#endif
    if( !(cpu&X264_CPU_SSE) )
        return;
    pf[I_PRED_16x16_V] = x264_predict_16x16_v_sse;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_sse2;
    /* Flag set means: stop here and keep whatever was installed so far. */
    if( cpu&X264_CPU_SSE2_IS_SLOW )
        return;
    pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_sse2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
    pf[I_PRED_16x16_P] = predict_16x16_p_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    if( !(cpu&X264_CPU_SLOW_PSHUFB) )
        pf[I_PRED_16x16_H] = x264_predict_16x16_h_ssse3;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_16x16_P] = predict_16x16_p_ssse3;
#endif
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_16x16_P] = predict_16x16_p_avx;
#endif // HIGH_BIT_DEPTH

    /* Reached only when every earlier capability check passed. */
    if( cpu&X264_CPU_AVX2 )
    {
        pf[I_PRED_16x16_P] = predict_16x16_p_avx2;
        pf[I_PRED_16x16_DC] = x264_predict_16x16_dc_avx2;
        pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_avx2;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2;
    }
}
|
||||
|
||||
/* Fills the 8x8 chroma intra-prediction function table with x86
 * implementations.
 *
 * cpu : bitmask of X264_CPU_* capability flags
 * pf  : table indexed by I_PRED_CHROMA_*; only entries with an x86 version
 *       are overwritten.
 *
 * Capability ladder: later stages overwrite earlier ones and the function
 * returns at the first capability the CPU lacks, so statement order decides
 * which implementation wins. */
void x264_predict_8x8c_init_mmx( uint32_t cpu, x264_predict_t pf[7] )
{
    if( !(cpu&X264_CPU_MMX) )
        return;
#if HIGH_BIT_DEPTH
    pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2;
    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2;
    if( !(cpu&X264_CPU_SSE) )
        return;
    pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_sse;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_sse2;
    pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_sse2;
    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_sse2;
    pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_CHROMA_P] = predict_8x8c_p_avx;
    if( !(cpu&X264_CPU_AVX2) )
        return;
    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_avx2;
#else
#if ARCH_X86_64
    /* C implementation defined in this file; it relies on 64-bit stores. */
    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
#endif
    pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_mmx;
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_mmx2;
    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_mmx2;
#if !ARCH_X86_64
    pf[I_PRED_CHROMA_P] = predict_8x8c_p_mmx2;
#endif
    pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_mmx2;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_P] = predict_8x8c_p_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_ssse3;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_CHROMA_P] = predict_8x8c_p_ssse3;
#endif
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_CHROMA_P] = predict_8x8c_p_avx;
#endif // HIGH_BIT_DEPTH

    /* Reached only when every earlier capability check passed. */
    if( cpu&X264_CPU_AVX2 )
    {
        pf[I_PRED_CHROMA_P] = predict_8x8c_p_avx2;
    }
}
|
||||
|
||||
/* Fills the 8x16 chroma intra-prediction function table with x86
 * implementations.
 *
 * cpu : bitmask of X264_CPU_* capability flags
 * pf  : table indexed by I_PRED_CHROMA_*; only entries with an x86 version
 *       are overwritten.
 *
 * Capability ladder: later stages overwrite earlier ones and the function
 * returns at the first capability the CPU lacks, so statement order decides
 * which implementation wins. */
void x264_predict_8x16c_init_mmx( uint32_t cpu, x264_predict_t pf[7] )
{
    if( !(cpu&X264_CPU_MMX) )
        return;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
    pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
    if( !(cpu&X264_CPU_SSE) )
        return;
    pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_sse;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_sse2;
    pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_sse2;
    pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_sse2;
    pf[I_PRED_CHROMA_P] = predict_8x16c_p_sse2;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_CHROMA_P] = predict_8x16c_p_avx;
    if( !(cpu&X264_CPU_AVX2) )
        return;
    pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_avx2;
#else
    pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_mmx;
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_mmx2;
    pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_mmx2;
    pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_mmx2;
#if !ARCH_X86_64
    pf[I_PRED_CHROMA_P] = predict_8x16c_p_mmx2;
#endif
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_CHROMA_P] = predict_8x16c_p_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_CHROMA_P] = predict_8x16c_p_avx;
#endif // HIGH_BIT_DEPTH

    /* Reached only when every earlier capability check passed. */
    if( cpu&X264_CPU_AVX2 )
    {
        pf[I_PRED_CHROMA_P] = predict_8x16c_p_avx2;
    }
}
|
||||
|
||||
/* Fills the 8x8 luma intra-prediction function table, and the edge-filter
 * pointer, with x86 implementations.
 *
 * cpu                : bitmask of X264_CPU_* capability flags
 * pf                 : table indexed by I_PRED_8x8_*; only entries with an
 *                      x86 version are overwritten
 * predict_8x8_filter : out-parameter; overwritten with the edge-filter
 *                      routine of the highest stage reached that has one
 *
 * Capability ladder: later stages overwrite earlier ones and the function
 * returns at the first capability the CPU lacks, so statement order decides
 * which implementation wins. */
void x264_predict_8x8_init_mmx( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
{
    if( !(cpu&X264_CPU_MMX2) )
        return;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSE) )
        return;
    pf[I_PRED_8x8_V] = x264_predict_8x8_v_sse;
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_8x8_H] = x264_predict_8x8_h_sse2;
    pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_sse2;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
    *predict_8x8_filter = x264_predict_8x8_filter_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3;
    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_ssse3;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_ssse3;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_ssse3;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
    /* Dedicated DDL/DDR variants when the CPU reports 64-byte cache lines. */
    if( cpu&X264_CPU_CACHELINE_64 )
    {
        pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_cache64_ssse3;
        pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_cache64_ssse3;
    }
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_avx;
    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
    *predict_8x8_filter = x264_predict_8x8_filter_avx;
#else
    pf[I_PRED_8x8_V] = x264_predict_8x8_v_mmx2;
    pf[I_PRED_8x8_H] = x264_predict_8x8_h_mmx2;
    pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_mmx2;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_mmx2;
    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_mmx2;
    *predict_8x8_filter = x264_predict_8x8_filter_mmx2;
    /* NOTE(review): presumably ARCH_X86 selects 32-bit builds only -- confirm. */
#if ARCH_X86
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_mmx2;
    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_mmx2;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_mmx2;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_mmx2;
#endif
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_sse2;
    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_sse2;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_sse2;
    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_sse2;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_sse2;
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    /* DDL/VR keep their SSE2 versions when the slow-palignr flag is set. */
    if( !(cpu&X264_CPU_SLOW_PALIGNR) )
    {
        pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_ssse3;
        pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_ssse3;
    }
    pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_ssse3;
    *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_avx;
    pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
    pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
    pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
    pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
#endif // HIGH_BIT_DEPTH
}
|
||||
|
||||
/* Fills the 4x4 luma intra-prediction function table with x86
 * implementations.
 *
 * cpu : bitmask of X264_CPU_* capability flags
 * pf  : table indexed by I_PRED_4x4_*; only entries with an x86 version are
 *       overwritten.
 *
 * Capability ladder: later stages overwrite earlier ones and the function
 * returns at the first capability the CPU lacks, so statement order decides
 * which implementation wins. */
void x264_predict_4x4_init_mmx( uint32_t cpu, x264_predict_t pf[12] )
{
    if( !(cpu&X264_CPU_MMX2) )
        return;
    pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_mmx2;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmx2;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmx2;
    pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_mmx2;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_mmx2;
    pf[I_PRED_4x4_HU] = x264_predict_4x4_hu_mmx2;
#if HIGH_BIT_DEPTH
    if( !(cpu&X264_CPU_SSE2) )
        return;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_sse2;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_sse2;
    pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_sse2;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_sse2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
    if( !(cpu&X264_CPU_AVX) )
        return;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_avx;
    pf[I_PRED_4x4_VL] = x264_predict_4x4_vl_avx;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_avx;
    if( !(cpu&X264_CPU_AVX2) )
        return;
    pf[I_PRED_4x4_H] = x264_predict_4x4_h_avx2;
#else
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_mmx2;
    if( !(cpu&X264_CPU_SSSE3) )
        return;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
    /* Dedicated VR variant when the CPU reports 64-byte cache lines. */
    if( cpu&X264_CPU_CACHELINE_64 )
        pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_cache64_ssse3;
#endif // HIGH_BIT_DEPTH
}
|
||||
256
common/x86/predict.h
Normal file
256
common/x86/predict.h
Normal file
@@ -0,0 +1,256 @@
|
||||
/*****************************************************************************
|
||||
* predict.h: x86 intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2003-2025 x264 project
|
||||
*
|
||||
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
|
||||
* Loren Merritt <lorenm@u.washington.edu>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_PREDICT_H
|
||||
#define X264_X86_PREDICT_H
|
||||
|
||||
#define x264_predict_16x16_init_mmx x264_template(predict_16x16_init_mmx)
|
||||
void x264_predict_16x16_init_mmx( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_8x16c_init_mmx x264_template(predict_8x16c_init_mmx)
|
||||
void x264_predict_8x16c_init_mmx( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_8x8c_init_mmx x264_template(predict_8x8c_init_mmx)
|
||||
void x264_predict_8x8c_init_mmx ( uint32_t cpu, x264_predict_t pf[7] );
|
||||
#define x264_predict_4x4_init_mmx x264_template(predict_4x4_init_mmx)
|
||||
void x264_predict_4x4_init_mmx ( uint32_t cpu, x264_predict_t pf[12] );
|
||||
#define x264_predict_8x8_init_mmx x264_template(predict_8x8_init_mmx)
|
||||
void x264_predict_8x8_init_mmx ( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
|
||||
|
||||
#define x264_predict_16x16_v_mmx2 x264_template(predict_16x16_v_mmx2)
|
||||
void x264_predict_16x16_v_mmx2( pixel *src );
|
||||
#define x264_predict_16x16_v_sse x264_template(predict_16x16_v_sse)
|
||||
void x264_predict_16x16_v_sse ( pixel *src );
|
||||
#define x264_predict_16x16_v_avx x264_template(predict_16x16_v_avx)
|
||||
void x264_predict_16x16_v_avx ( uint16_t *src );
|
||||
#define x264_predict_16x16_h_mmx2 x264_template(predict_16x16_h_mmx2)
|
||||
void x264_predict_16x16_h_mmx2( pixel *src );
|
||||
#define x264_predict_16x16_h_sse2 x264_template(predict_16x16_h_sse2)
|
||||
void x264_predict_16x16_h_sse2( uint16_t *src );
|
||||
#define x264_predict_16x16_h_ssse3 x264_template(predict_16x16_h_ssse3)
|
||||
void x264_predict_16x16_h_ssse3( uint8_t *src );
|
||||
#define x264_predict_16x16_h_avx2 x264_template(predict_16x16_h_avx2)
|
||||
void x264_predict_16x16_h_avx2( uint16_t *src );
|
||||
#define x264_predict_16x16_dc_sse2 x264_template(predict_16x16_dc_sse2)
|
||||
void x264_predict_16x16_dc_sse2( pixel *src );
|
||||
#define x264_predict_16x16_dc_avx2 x264_template(predict_16x16_dc_avx2)
|
||||
void x264_predict_16x16_dc_avx2( pixel *src );
|
||||
#define x264_predict_16x16_dc_left_sse2 x264_template(predict_16x16_dc_left_sse2)
|
||||
void x264_predict_16x16_dc_left_sse2( pixel *src );
|
||||
#define x264_predict_16x16_dc_left_avx2 x264_template(predict_16x16_dc_left_avx2)
|
||||
void x264_predict_16x16_dc_left_avx2( pixel *src );
|
||||
#define x264_predict_16x16_dc_top_sse2 x264_template(predict_16x16_dc_top_sse2)
|
||||
void x264_predict_16x16_dc_top_sse2( pixel *src );
|
||||
#define x264_predict_16x16_dc_top_avx2 x264_template(predict_16x16_dc_top_avx2)
|
||||
void x264_predict_16x16_dc_top_avx2( pixel *src );
|
||||
#define x264_predict_16x16_p_core_mmx2 x264_template(predict_16x16_p_core_mmx2)
|
||||
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
|
||||
#define x264_predict_16x16_p_core_sse2 x264_template(predict_16x16_p_core_sse2)
|
||||
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_16x16_p_core_avx x264_template(predict_16x16_p_core_avx)
|
||||
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_16x16_p_core_avx2 x264_template(predict_16x16_p_core_avx2)
|
||||
void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x16c_dc_mmx2 x264_template(predict_8x16c_dc_mmx2)
|
||||
void x264_predict_8x16c_dc_mmx2( pixel *src );
|
||||
#define x264_predict_8x16c_dc_sse2 x264_template(predict_8x16c_dc_sse2)
|
||||
void x264_predict_8x16c_dc_sse2( uint16_t *src );
|
||||
#define x264_predict_8x16c_dc_top_mmx2 x264_template(predict_8x16c_dc_top_mmx2)
|
||||
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
|
||||
#define x264_predict_8x16c_dc_top_sse2 x264_template(predict_8x16c_dc_top_sse2)
|
||||
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
|
||||
#define x264_predict_8x16c_v_mmx x264_template(predict_8x16c_v_mmx)
|
||||
void x264_predict_8x16c_v_mmx( uint8_t *src );
|
||||
#define x264_predict_8x16c_v_sse x264_template(predict_8x16c_v_sse)
|
||||
void x264_predict_8x16c_v_sse( uint16_t *src );
|
||||
#define x264_predict_8x16c_h_mmx2 x264_template(predict_8x16c_h_mmx2)
|
||||
void x264_predict_8x16c_h_mmx2( pixel *src );
|
||||
#define x264_predict_8x16c_h_sse2 x264_template(predict_8x16c_h_sse2)
|
||||
void x264_predict_8x16c_h_sse2( uint16_t *src );
|
||||
#define x264_predict_8x16c_h_ssse3 x264_template(predict_8x16c_h_ssse3)
|
||||
void x264_predict_8x16c_h_ssse3( uint8_t *src );
|
||||
#define x264_predict_8x16c_h_avx2 x264_template(predict_8x16c_h_avx2)
|
||||
void x264_predict_8x16c_h_avx2( uint16_t *src );
|
||||
#define x264_predict_8x16c_p_core_mmx2 x264_template(predict_8x16c_p_core_mmx2)
|
||||
void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
|
||||
#define x264_predict_8x16c_p_core_sse2 x264_template(predict_8x16c_p_core_sse2)
|
||||
void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x16c_p_core_avx x264_template(predict_8x16c_p_core_avx)
|
||||
void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x16c_p_core_avx2 x264_template(predict_8x16c_p_core_avx2)
|
||||
void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x8c_p_core_mmx2 x264_template(predict_8x8c_p_core_mmx2)
|
||||
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
|
||||
#define x264_predict_8x8c_p_core_sse2 x264_template(predict_8x8c_p_core_sse2)
|
||||
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x8c_p_core_avx x264_template(predict_8x8c_p_core_avx)
|
||||
void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x8c_p_core_avx2 x264_template(predict_8x8c_p_core_avx2)
|
||||
void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c );
|
||||
#define x264_predict_8x8c_dc_mmx2 x264_template(predict_8x8c_dc_mmx2)
|
||||
void x264_predict_8x8c_dc_mmx2( pixel *src );
|
||||
#define x264_predict_8x8c_dc_sse2 x264_template(predict_8x8c_dc_sse2)
|
||||
void x264_predict_8x8c_dc_sse2( uint16_t *src );
|
||||
#define x264_predict_8x8c_dc_top_mmx2 x264_template(predict_8x8c_dc_top_mmx2)
|
||||
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
|
||||
#define x264_predict_8x8c_dc_top_sse2 x264_template(predict_8x8c_dc_top_sse2)
|
||||
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
|
||||
#define x264_predict_8x8c_v_mmx x264_template(predict_8x8c_v_mmx)
|
||||
void x264_predict_8x8c_v_mmx( pixel *src );
|
||||
#define x264_predict_8x8c_v_sse x264_template(predict_8x8c_v_sse)
|
||||
void x264_predict_8x8c_v_sse( uint16_t *src );
|
||||
#define x264_predict_8x8c_h_mmx2 x264_template(predict_8x8c_h_mmx2)
|
||||
void x264_predict_8x8c_h_mmx2( pixel *src );
|
||||
#define x264_predict_8x8c_h_sse2 x264_template(predict_8x8c_h_sse2)
|
||||
void x264_predict_8x8c_h_sse2( uint16_t *src );
|
||||
#define x264_predict_8x8c_h_ssse3 x264_template(predict_8x8c_h_ssse3)
|
||||
void x264_predict_8x8c_h_ssse3( uint8_t *src );
|
||||
#define x264_predict_8x8c_h_avx2 x264_template(predict_8x8c_h_avx2)
|
||||
void x264_predict_8x8c_h_avx2( uint16_t *src );
|
||||
#define x264_predict_8x8_v_mmx2 x264_template(predict_8x8_v_mmx2)
|
||||
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_v_sse x264_template(predict_8x8_v_sse)
|
||||
void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] );
|
||||
#define x264_predict_8x8_h_mmx2 x264_template(predict_8x8_h_mmx2)
|
||||
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_h_sse2 x264_template(predict_8x8_h_sse2)
|
||||
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
|
||||
#define x264_predict_8x8_hd_mmx2 x264_template(predict_8x8_hd_mmx2)
|
||||
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_hu_mmx2 x264_template(predict_8x8_hu_mmx2)
|
||||
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_dc_mmx2 x264_template(predict_8x8_dc_mmx2)
|
||||
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_dc_sse2 x264_template(predict_8x8_dc_sse2)
|
||||
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
|
||||
#define x264_predict_8x8_dc_top_mmx2 x264_template(predict_8x8_dc_top_mmx2)
|
||||
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_dc_top_sse2 x264_template(predict_8x8_dc_top_sse2)
|
||||
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
|
||||
#define x264_predict_8x8_dc_left_mmx2 x264_template(predict_8x8_dc_left_mmx2)
|
||||
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_dc_left_sse2 x264_template(predict_8x8_dc_left_sse2)
|
||||
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
|
||||
#define x264_predict_8x8_ddl_mmx2 x264_template(predict_8x8_ddl_mmx2)
|
||||
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_ddl_sse2 x264_template(predict_8x8_ddl_sse2)
|
||||
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddl_ssse3 x264_template(predict_8x8_ddl_ssse3)
|
||||
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddl_cache64_ssse3 x264_template(predict_8x8_ddl_cache64_ssse3)
|
||||
void x264_predict_8x8_ddl_cache64_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddl_avx x264_template(predict_8x8_ddl_avx)
|
||||
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddr_mmx2 x264_template(predict_8x8_ddr_mmx2)
|
||||
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_ddr_sse2 x264_template(predict_8x8_ddr_sse2)
|
||||
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddr_ssse3 x264_template(predict_8x8_ddr_ssse3)
|
||||
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddr_cache64_ssse3 x264_template(predict_8x8_ddr_cache64_ssse3)
|
||||
void x264_predict_8x8_ddr_cache64_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_ddr_avx x264_template(predict_8x8_ddr_avx)
|
||||
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_vl_sse2 x264_template(predict_8x8_vl_sse2)
|
||||
void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_vl_ssse3 x264_template(predict_8x8_vl_ssse3)
|
||||
void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_vl_avx x264_template(predict_8x8_vl_avx)
|
||||
void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_vl_mmx2 x264_template(predict_8x8_vl_mmx2)
|
||||
void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_vr_mmx2 x264_template(predict_8x8_vr_mmx2)
|
||||
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
|
||||
#define x264_predict_8x8_vr_sse2 x264_template(predict_8x8_vr_sse2)
|
||||
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_vr_ssse3 x264_template(predict_8x8_vr_ssse3)
|
||||
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_vr_avx x264_template(predict_8x8_vr_avx)
|
||||
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_hu_sse2 x264_template(predict_8x8_hu_sse2)
|
||||
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_hu_ssse3 x264_template(predict_8x8_hu_ssse3)
|
||||
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_hu_avx x264_template(predict_8x8_hu_avx)
|
||||
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_hd_sse2 x264_template(predict_8x8_hd_sse2)
|
||||
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_hd_ssse3 x264_template(predict_8x8_hd_ssse3)
|
||||
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_hd_avx x264_template(predict_8x8_hd_avx)
|
||||
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
|
||||
#define x264_predict_8x8_filter_mmx2 x264_template(predict_8x8_filter_mmx2)
|
||||
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
|
||||
#define x264_predict_8x8_filter_sse2 x264_template(predict_8x8_filter_sse2)
|
||||
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
|
||||
#define x264_predict_8x8_filter_ssse3 x264_template(predict_8x8_filter_ssse3)
|
||||
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
|
||||
#define x264_predict_8x8_filter_avx x264_template(predict_8x8_filter_avx)
|
||||
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
|
||||
#define x264_predict_4x4_h_avx2 x264_template(predict_4x4_h_avx2)
|
||||
void x264_predict_4x4_h_avx2( uint16_t *src );
|
||||
#define x264_predict_4x4_ddl_mmx2 x264_template(predict_4x4_ddl_mmx2)
|
||||
void x264_predict_4x4_ddl_mmx2( pixel *src );
|
||||
#define x264_predict_4x4_ddl_sse2 x264_template(predict_4x4_ddl_sse2)
|
||||
void x264_predict_4x4_ddl_sse2( uint16_t *src );
|
||||
#define x264_predict_4x4_ddl_avx x264_template(predict_4x4_ddl_avx)
|
||||
void x264_predict_4x4_ddl_avx( uint16_t *src );
|
||||
#define x264_predict_4x4_ddr_mmx2 x264_template(predict_4x4_ddr_mmx2)
|
||||
void x264_predict_4x4_ddr_mmx2( pixel *src );
|
||||
#define x264_predict_4x4_vl_mmx2 x264_template(predict_4x4_vl_mmx2)
|
||||
void x264_predict_4x4_vl_mmx2( pixel *src );
|
||||
#define x264_predict_4x4_vl_sse2 x264_template(predict_4x4_vl_sse2)
|
||||
void x264_predict_4x4_vl_sse2( uint16_t *src );
|
||||
#define x264_predict_4x4_vl_avx x264_template(predict_4x4_vl_avx)
|
||||
void x264_predict_4x4_vl_avx( uint16_t *src );
|
||||
#define x264_predict_4x4_vr_mmx2 x264_template(predict_4x4_vr_mmx2)
|
||||
void x264_predict_4x4_vr_mmx2( uint8_t *src );
|
||||
#define x264_predict_4x4_vr_sse2 x264_template(predict_4x4_vr_sse2)
|
||||
void x264_predict_4x4_vr_sse2( uint16_t *src );
|
||||
#define x264_predict_4x4_vr_ssse3 x264_template(predict_4x4_vr_ssse3)
|
||||
void x264_predict_4x4_vr_ssse3( pixel *src );
|
||||
#define x264_predict_4x4_vr_cache64_ssse3 x264_template(predict_4x4_vr_cache64_ssse3)
|
||||
void x264_predict_4x4_vr_cache64_ssse3( uint8_t *src );
|
||||
#define x264_predict_4x4_vr_avx x264_template(predict_4x4_vr_avx)
|
||||
void x264_predict_4x4_vr_avx( uint16_t *src );
|
||||
#define x264_predict_4x4_hd_mmx2 x264_template(predict_4x4_hd_mmx2)
|
||||
void x264_predict_4x4_hd_mmx2( pixel *src );
|
||||
#define x264_predict_4x4_hd_sse2 x264_template(predict_4x4_hd_sse2)
|
||||
void x264_predict_4x4_hd_sse2( uint16_t *src );
|
||||
#define x264_predict_4x4_hd_ssse3 x264_template(predict_4x4_hd_ssse3)
|
||||
void x264_predict_4x4_hd_ssse3( pixel *src );
|
||||
#define x264_predict_4x4_hd_avx x264_template(predict_4x4_hd_avx)
|
||||
void x264_predict_4x4_hd_avx( uint16_t *src );
|
||||
#define x264_predict_4x4_dc_mmx2 x264_template(predict_4x4_dc_mmx2)
|
||||
void x264_predict_4x4_dc_mmx2( pixel *src );
|
||||
#define x264_predict_4x4_ddr_sse2 x264_template(predict_4x4_ddr_sse2)
|
||||
void x264_predict_4x4_ddr_sse2( uint16_t *src );
|
||||
#define x264_predict_4x4_ddr_ssse3 x264_template(predict_4x4_ddr_ssse3)
|
||||
void x264_predict_4x4_ddr_ssse3( pixel *src );
|
||||
#define x264_predict_4x4_ddr_avx x264_template(predict_4x4_ddr_avx)
|
||||
void x264_predict_4x4_ddr_avx( uint16_t *src );
|
||||
#define x264_predict_4x4_hu_mmx2 x264_template(predict_4x4_hu_mmx2)
|
||||
void x264_predict_4x4_hu_mmx2( pixel *src );
|
||||
|
||||
#endif
|
||||
2269
common/x86/quant-a.asm
Normal file
2269
common/x86/quant-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
278
common/x86/quant.h
Normal file
278
common/x86/quant.h
Normal file
@@ -0,0 +1,278 @@
|
||||
/*****************************************************************************
|
||||
* quant.h: x86 quantization and level-run
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2005-2025 x264 project
|
||||
*
|
||||
* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
* Fiona Glaser <fiona@x264.com>
|
||||
* Christian Heine <sennindemokrit@gmx.net>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_QUANT_H
|
||||
#define X264_X86_QUANT_H
|
||||
|
||||
#define x264_quant_2x2_dc_mmx2 x264_template(quant_2x2_dc_mmx2)
|
||||
int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
|
||||
#define x264_quant_4x4_dc_mmx2 x264_template(quant_4x4_dc_mmx2)
|
||||
int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
|
||||
#define x264_quant_4x4_mmx2 x264_template(quant_4x4_mmx2)
|
||||
int x264_quant_4x4_mmx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_8x8_mmx2 x264_template(quant_8x8_mmx2)
|
||||
int x264_quant_8x8_mmx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||
#define x264_quant_2x2_dc_sse2 x264_template(quant_2x2_dc_sse2)
|
||||
/* 2x2 DC quantization, SSE2 variant.
 * dct:  the 4 DC coefficients of a 2x2 block (the array bound is documentation
 *       only; the parameter is adjusted to `dctcoef *`). Declared as [4] to
 *       agree with the mmx2 and ssse3 prototypes of the same operation.
 * mf, bias: scalar quantization multiplier and rounding bias.
 * Returns int -- NOTE(review): presumably a non-zero flag; confirm against quant-a.asm. */
int x264_quant_2x2_dc_sse2( dctcoef dct[4], int mf, int bias );
|
||||
#define x264_quant_4x4_dc_sse2 x264_template(quant_4x4_dc_sse2)
|
||||
int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
|
||||
#define x264_quant_4x4_sse2 x264_template(quant_4x4_sse2)
|
||||
int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_4x4x4_sse2 x264_template(quant_4x4x4_sse2)
|
||||
int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_8x8_sse2 x264_template(quant_8x8_sse2)
|
||||
int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||
#define x264_quant_2x2_dc_ssse3 x264_template(quant_2x2_dc_ssse3)
|
||||
int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
|
||||
#define x264_quant_4x4_dc_ssse3 x264_template(quant_4x4_dc_ssse3)
|
||||
int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
|
||||
#define x264_quant_4x4_ssse3 x264_template(quant_4x4_ssse3)
|
||||
int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_4x4x4_ssse3 x264_template(quant_4x4x4_ssse3)
|
||||
int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_8x8_ssse3 x264_template(quant_8x8_ssse3)
|
||||
int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||
#define x264_quant_2x2_dc_sse4 x264_template(quant_2x2_dc_sse4)
|
||||
/* 2x2 DC quantization, SSE4 variant.
 * dct:  the 4 DC coefficients of a 2x2 block (the array bound is documentation
 *       only; the parameter is adjusted to `dctcoef *`). Declared as [4] to
 *       agree with the mmx2 and ssse3 prototypes of the same operation.
 * mf, bias: scalar quantization multiplier and rounding bias.
 * Returns int -- NOTE(review): presumably a non-zero flag; confirm against quant-a.asm. */
int x264_quant_2x2_dc_sse4( dctcoef dct[4], int mf, int bias );
|
||||
#define x264_quant_4x4_dc_sse4 x264_template(quant_4x4_dc_sse4)
|
||||
int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
|
||||
#define x264_quant_4x4_sse4 x264_template(quant_4x4_sse4)
|
||||
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_4x4x4_sse4 x264_template(quant_4x4x4_sse4)
|
||||
int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_8x8_sse4 x264_template(quant_8x8_sse4)
|
||||
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||
#define x264_quant_4x4_avx2 x264_template(quant_4x4_avx2)
|
||||
int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_quant_4x4_dc_avx2 x264_template(quant_4x4_dc_avx2)
|
||||
int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias );
|
||||
#define x264_quant_8x8_avx2 x264_template(quant_8x8_avx2)
|
||||
int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
|
||||
#define x264_quant_4x4x4_avx2 x264_template(quant_4x4x4_avx2)
|
||||
int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
|
||||
#define x264_dequant_4x4_mmx x264_template(dequant_4x4_mmx)
|
||||
/* 4x4 dequantization, MMX variant.
 * Declared on int16_t coefficients, whereas the SSE2 and later variants in
 * this header take dctcoef.
 * NOTE(review): presumably this variant is only usable when dctcoef is
 * 16 bits wide -- confirm at the call site before assigning it. */
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4dc_mmx2 x264_template(dequant_4x4dc_mmx2)
|
||||
void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_mmx x264_template(dequant_8x8_mmx)
|
||||
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_sse2 x264_template(dequant_4x4_sse2)
|
||||
void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4dc_sse2 x264_template(dequant_4x4dc_sse2)
|
||||
void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_sse2 x264_template(dequant_8x8_sse2)
|
||||
void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_avx x264_template(dequant_4x4_avx)
|
||||
void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4dc_avx x264_template(dequant_4x4dc_avx)
|
||||
void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_avx x264_template(dequant_8x8_avx)
|
||||
void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_xop x264_template(dequant_4x4_xop)
|
||||
void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4dc_xop x264_template(dequant_4x4dc_xop)
|
||||
void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_xop x264_template(dequant_8x8_xop)
|
||||
void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_avx2 x264_template(dequant_4x4_avx2)
|
||||
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_4x4dc_avx2 x264_template(dequant_4x4dc_avx2)
|
||||
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_avx2 x264_template(dequant_8x8_avx2)
|
||||
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_avx512 x264_template(dequant_4x4_avx512)
|
||||
void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_avx512 x264_template(dequant_8x8_avx512)
|
||||
void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_flat16_mmx x264_template(dequant_4x4_flat16_mmx)
|
||||
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_flat16_mmx x264_template(dequant_8x8_flat16_mmx)
|
||||
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_flat16_sse2 x264_template(dequant_4x4_flat16_sse2)
|
||||
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_flat16_sse2 x264_template(dequant_8x8_flat16_sse2)
|
||||
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_4x4_flat16_avx2 x264_template(dequant_4x4_flat16_avx2)
|
||||
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_dequant_8x8_flat16_avx2 x264_template(dequant_8x8_flat16_avx2)
|
||||
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_dequant_8x8_flat16_avx512 x264_template(dequant_8x8_flat16_avx512)
|
||||
void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp );
|
||||
#define x264_idct_dequant_2x4_dc_sse2 x264_template(idct_dequant_2x4_dc_sse2)
|
||||
void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_idct_dequant_2x4_dc_avx x264_template(idct_dequant_2x4_dc_avx)
|
||||
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_idct_dequant_2x4_dconly_sse2 x264_template(idct_dequant_2x4_dconly_sse2)
|
||||
void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_idct_dequant_2x4_dconly_avx x264_template(idct_dequant_2x4_dconly_avx)
|
||||
void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
|
||||
#define x264_optimize_chroma_2x2_dc_sse2 x264_template(optimize_chroma_2x2_dc_sse2)
|
||||
int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
|
||||
#define x264_optimize_chroma_2x2_dc_ssse3 x264_template(optimize_chroma_2x2_dc_ssse3)
|
||||
int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
|
||||
#define x264_optimize_chroma_2x2_dc_sse4 x264_template(optimize_chroma_2x2_dc_sse4)
|
||||
int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
|
||||
#define x264_optimize_chroma_2x2_dc_avx x264_template(optimize_chroma_2x2_dc_avx)
|
||||
int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf );
|
||||
#define x264_denoise_dct_mmx x264_template(denoise_dct_mmx)
|
||||
void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
|
||||
#define x264_denoise_dct_sse2 x264_template(denoise_dct_sse2)
|
||||
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
|
||||
#define x264_denoise_dct_ssse3 x264_template(denoise_dct_ssse3)
|
||||
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
|
||||
#define x264_denoise_dct_avx x264_template(denoise_dct_avx)
|
||||
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
|
||||
#define x264_denoise_dct_avx2 x264_template(denoise_dct_avx2)
|
||||
void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
|
||||
#define x264_decimate_score15_sse2 x264_template(decimate_score15_sse2)
|
||||
int x264_decimate_score15_sse2( dctcoef *dct );
|
||||
#define x264_decimate_score15_ssse3 x264_template(decimate_score15_ssse3)
|
||||
int x264_decimate_score15_ssse3( dctcoef *dct );
|
||||
#define x264_decimate_score15_avx512 x264_template(decimate_score15_avx512)
|
||||
int x264_decimate_score15_avx512( dctcoef *dct );
|
||||
#define x264_decimate_score16_sse2 x264_template(decimate_score16_sse2)
|
||||
int x264_decimate_score16_sse2( dctcoef *dct );
|
||||
#define x264_decimate_score16_ssse3 x264_template(decimate_score16_ssse3)
|
||||
int x264_decimate_score16_ssse3( dctcoef *dct );
|
||||
#define x264_decimate_score16_avx512 x264_template(decimate_score16_avx512)
|
||||
int x264_decimate_score16_avx512( dctcoef *dct );
|
||||
#define x264_decimate_score64_sse2 x264_template(decimate_score64_sse2)
|
||||
int x264_decimate_score64_sse2( dctcoef *dct );
|
||||
#define x264_decimate_score64_ssse3 x264_template(decimate_score64_ssse3)
|
||||
int x264_decimate_score64_ssse3( dctcoef *dct );
|
||||
#define x264_decimate_score64_avx2 x264_template(decimate_score64_avx2)
|
||||
/* Decimate score over 64 coefficients, AVX2 variant.
 * Unlike the sse2/ssse3/avx512 prototypes of decimate_score64 in this header,
 * this one is declared on int16_t rather than dctcoef.
 * NOTE(review): presumably only valid when dctcoef is int16_t -- confirm. */
int x264_decimate_score64_avx2( int16_t *dct );
|
||||
#define x264_decimate_score64_avx512 x264_template(decimate_score64_avx512)
|
||||
int x264_decimate_score64_avx512( dctcoef *dct );
|
||||
#define x264_coeff_last4_mmx2 x264_template(coeff_last4_mmx2)
|
||||
int x264_coeff_last4_mmx2( dctcoef *dct );
|
||||
#define x264_coeff_last8_mmx2 x264_template(coeff_last8_mmx2)
|
||||
int x264_coeff_last8_mmx2( dctcoef *dct );
|
||||
#define x264_coeff_last15_mmx2 x264_template(coeff_last15_mmx2)
|
||||
int x264_coeff_last15_mmx2( dctcoef *dct );
|
||||
#define x264_coeff_last16_mmx2 x264_template(coeff_last16_mmx2)
|
||||
int x264_coeff_last16_mmx2( dctcoef *dct );
|
||||
#define x264_coeff_last64_mmx2 x264_template(coeff_last64_mmx2)
|
||||
int x264_coeff_last64_mmx2( dctcoef *dct );
|
||||
#define x264_coeff_last8_sse2 x264_template(coeff_last8_sse2)
|
||||
int x264_coeff_last8_sse2( dctcoef *dct );
|
||||
#define x264_coeff_last15_sse2 x264_template(coeff_last15_sse2)
|
||||
int x264_coeff_last15_sse2( dctcoef *dct );
|
||||
#define x264_coeff_last16_sse2 x264_template(coeff_last16_sse2)
|
||||
int x264_coeff_last16_sse2( dctcoef *dct );
|
||||
#define x264_coeff_last64_sse2 x264_template(coeff_last64_sse2)
|
||||
int x264_coeff_last64_sse2( dctcoef *dct );
|
||||
#define x264_coeff_last4_lzcnt x264_template(coeff_last4_lzcnt)
|
||||
int x264_coeff_last4_lzcnt( dctcoef *dct );
|
||||
#define x264_coeff_last8_lzcnt x264_template(coeff_last8_lzcnt)
|
||||
int x264_coeff_last8_lzcnt( dctcoef *dct );
|
||||
#define x264_coeff_last15_lzcnt x264_template(coeff_last15_lzcnt)
|
||||
int x264_coeff_last15_lzcnt( dctcoef *dct );
|
||||
#define x264_coeff_last16_lzcnt x264_template(coeff_last16_lzcnt)
|
||||
int x264_coeff_last16_lzcnt( dctcoef *dct );
|
||||
#define x264_coeff_last64_lzcnt x264_template(coeff_last64_lzcnt)
|
||||
int x264_coeff_last64_lzcnt( dctcoef *dct );
|
||||
#define x264_coeff_last64_avx2 x264_template(coeff_last64_avx2)
|
||||
int x264_coeff_last64_avx2 ( dctcoef *dct );
|
||||
#define x264_coeff_last4_avx512 x264_template(coeff_last4_avx512)
|
||||
/* coeff_last over 4 coefficients, AVX-512 variant.
 * Unlike the other coeff_last* prototypes in this header, this one is
 * declared on int32_t rather than dctcoef.
 * NOTE(review): presumably only valid when dctcoef is int32_t -- confirm. */
int x264_coeff_last4_avx512( int32_t *dct );
|
||||
#define x264_coeff_last8_avx512 x264_template(coeff_last8_avx512)
|
||||
int x264_coeff_last8_avx512( dctcoef *dct );
|
||||
#define x264_coeff_last15_avx512 x264_template(coeff_last15_avx512)
|
||||
int x264_coeff_last15_avx512( dctcoef *dct );
|
||||
#define x264_coeff_last16_avx512 x264_template(coeff_last16_avx512)
|
||||
int x264_coeff_last16_avx512( dctcoef *dct );
|
||||
#define x264_coeff_last64_avx512 x264_template(coeff_last64_avx512)
|
||||
int x264_coeff_last64_avx512( dctcoef *dct );
|
||||
#define x264_coeff_level_run16_mmx2 x264_template(coeff_level_run16_mmx2)
|
||||
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run16_sse2 x264_template(coeff_level_run16_sse2)
|
||||
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run16_lzcnt x264_template(coeff_level_run16_lzcnt)
|
||||
int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run16_ssse3 x264_template(coeff_level_run16_ssse3)
|
||||
int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run16_ssse3_lzcnt x264_template(coeff_level_run16_ssse3_lzcnt)
|
||||
int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run16_avx2 x264_template(coeff_level_run16_avx2)
|
||||
int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run15_mmx2 x264_template(coeff_level_run15_mmx2)
|
||||
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run15_sse2 x264_template(coeff_level_run15_sse2)
|
||||
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run15_lzcnt x264_template(coeff_level_run15_lzcnt)
|
||||
int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run15_ssse3 x264_template(coeff_level_run15_ssse3)
|
||||
int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run15_ssse3_lzcnt x264_template(coeff_level_run15_ssse3_lzcnt)
|
||||
int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run15_avx2 x264_template(coeff_level_run15_avx2)
|
||||
int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run4_mmx2 x264_template(coeff_level_run4_mmx2)
|
||||
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run4_lzcnt x264_template(coeff_level_run4_lzcnt)
|
||||
int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run4_ssse3 x264_template(coeff_level_run4_ssse3)
|
||||
int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run4_ssse3_lzcnt x264_template(coeff_level_run4_ssse3_lzcnt)
|
||||
int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run8_mmx2 x264_template(coeff_level_run8_mmx2)
|
||||
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run8_lzcnt x264_template(coeff_level_run8_lzcnt)
|
||||
int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run8_sse2 x264_template(coeff_level_run8_sse2)
|
||||
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run8_ssse3 x264_template(coeff_level_run8_ssse3)
|
||||
int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_coeff_level_run8_ssse3_lzcnt x264_template(coeff_level_run8_ssse3_lzcnt)
|
||||
int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
|
||||
#define x264_trellis_cabac_4x4_sse2 x264_template(trellis_cabac_4x4_sse2)
|
||||
/* Trellis (CABAC) entry points follow.
 * TRELLIS_PARAMS is a macro standing for the leading parameter list shared by
 * all of them; it is not defined in this header, so it must already be
 * defined wherever this header is included. */
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
|
||||
#define x264_trellis_cabac_4x4_ssse3 x264_template(trellis_cabac_4x4_ssse3)
|
||||
int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
|
||||
#define x264_trellis_cabac_8x8_sse2 x264_template(trellis_cabac_8x8_sse2)
|
||||
int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
|
||||
#define x264_trellis_cabac_8x8_ssse3 x264_template(trellis_cabac_8x8_ssse3)
|
||||
int x264_trellis_cabac_8x8_ssse3( TRELLIS_PARAMS, int b_interlaced );
|
||||
#define x264_trellis_cabac_4x4_psy_sse2 x264_template(trellis_cabac_4x4_psy_sse2)
|
||||
int x264_trellis_cabac_4x4_psy_sse2 ( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
|
||||
#define x264_trellis_cabac_4x4_psy_ssse3 x264_template(trellis_cabac_4x4_psy_ssse3)
|
||||
int x264_trellis_cabac_4x4_psy_ssse3( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
|
||||
#define x264_trellis_cabac_8x8_psy_sse2 x264_template(trellis_cabac_8x8_psy_sse2)
|
||||
int x264_trellis_cabac_8x8_psy_sse2 ( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
|
||||
#define x264_trellis_cabac_8x8_psy_ssse3 x264_template(trellis_cabac_8x8_psy_ssse3)
|
||||
int x264_trellis_cabac_8x8_psy_ssse3( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
|
||||
#define x264_trellis_cabac_dc_sse2 x264_template(trellis_cabac_dc_sse2)
|
||||
int x264_trellis_cabac_dc_sse2 ( TRELLIS_PARAMS, int i_coefs );
|
||||
#define x264_trellis_cabac_dc_ssse3 x264_template(trellis_cabac_dc_ssse3)
|
||||
int x264_trellis_cabac_dc_ssse3( TRELLIS_PARAMS, int i_coefs );
|
||||
#define x264_trellis_cabac_chroma_422_dc_sse2 x264_template(trellis_cabac_chroma_422_dc_sse2)
|
||||
int x264_trellis_cabac_chroma_422_dc_sse2 ( TRELLIS_PARAMS );
|
||||
#define x264_trellis_cabac_chroma_422_dc_ssse3 x264_template(trellis_cabac_chroma_422_dc_ssse3)
|
||||
int x264_trellis_cabac_chroma_422_dc_ssse3( TRELLIS_PARAMS );
|
||||
|
||||
#endif
|
||||
2215
common/x86/sad-a.asm
Normal file
2215
common/x86/sad-a.asm
Normal file
File diff suppressed because it is too large
Load Diff
727
common/x86/sad16-a.asm
Normal file
727
common/x86/sad16-a.asm
Normal file
@@ -0,0 +1,727 @@
|
||||
;*****************************************************************************
|
||||
;* sad16-a.asm: x86 high depth sad functions
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2010-2025 x264 project
|
||||
;*
|
||||
;* Authors: Oskar Arvidsson <oskar@irock.se>
|
||||
;* Henrik Gramner <henrik@gramner.com>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION .text
|
||||
|
||||
cextern pw_1
|
||||
cextern pw_4
|
||||
cextern pw_8
|
||||
|
||||
;=============================================================================
|
||||
; SAD MMX
|
||||
;=============================================================================
|
||||
|
||||
; Adds to m0 the word-wise absolute differences between four consecutive
; vectors at [r0] and the matching four vectors at [r2] (offsets 0/8/16/24),
; then advances r0 by 2*r1 and r2 by 2*r3 bytes.
; Scratch: m1-m4, with m5-m7 handed to ABSW2 as temporaries.
%macro SAD_INC_1x16P_MMX 0
    ; difference of each chunk, one register per chunk
    movu    m1, [r0+ 0]
    psubw   m1, [r2+ 0]
    movu    m2, [r0+ 8]
    psubw   m2, [r2+ 8]
    movu    m3, [r0+16]
    psubw   m3, [r2+16]
    movu    m4, [r0+24]
    psubw   m4, [r2+24]
    ; all memory reads are done, so the pointers can move on
    lea     r0, [r0+2*r1]
    lea     r2, [r2+2*r3]
    ; |diff| and accumulate, first pair then second pair
    ABSW2   m1, m2, m1, m2, m5, m6
    paddw   m1, m2
    paddw   m0, m1
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m3, m4
    paddw   m0, m3
%endmacro
|
||||
|
||||
; Adds to m0 the word-wise absolute differences for two rows: two vectors
; (offsets 0 and 8) from the row at [r0]/[r2] and two from the row at
; [r0+2*r1]/[r2+2*r3]. Afterwards r0 += 4*r1 and r2 += 4*r3.
; Scratch: m1-m4, with m5-m7 handed to ABSW2 as temporaries.
%macro SAD_INC_2x8P_MMX 0
    ; first row
    movu    m1, [r0+0]
    psubw   m1, [r2+0]
    movu    m2, [r0+8]
    psubw   m2, [r2+8]
    ; second row
    movu    m3, [r0+2*r1+0]
    psubw   m3, [r2+2*r3+0]
    movu    m4, [r0+2*r1+8]
    psubw   m4, [r2+2*r3+8]
    ; step both pointers past the two rows just read
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m1, m2, m1, m2, m5, m6
    paddw   m1, m2
    paddw   m0, m1
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m3, m4
    paddw   m0, m3
%endmacro
|
||||
|
||||
; Adds to m0 the word-wise absolute differences for two rows, one vector per
; row: [r0] vs [r2] and [r0+2*r1] vs [r2+2*r3]. Afterwards r0 += 4*r1 and
; r2 += 4*r3. Scratch: m1, m2, with m3/m4 handed to ABSW2 as temporaries.
%macro SAD_INC_2x4P_MMX 0
    movu    m1, [r0]
    psubw   m1, [r2]
    movu    m2, [r0+2*r1]
    psubw   m2, [r2+2*r3]
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    paddw   m0, m2
    paddw   m0, m1
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
|
||||
;-----------------------------------------------------------------------------
|
||||
; SAD_MMX width, height, rows_per_step
;   %1 = block width, %2 = block height, %3 = number of rows consumed by one
;        SAD_INC_%3x%1P_MMX invocation.
; Emits pixel_sad_%1x%2: r0/r1 = first pixel pointer and stride, r2/r3 =
; second pixel pointer and stride (see the prototype comment above).
; The sum is returned in eax.
%macro SAD_MMX 3
; r4 is only used as the loop counter, and the height-4 case is fully unrolled,
; so it needs one GPR less. The mask has to be parenthesised: NASM gives '/'
; a higher precedence than '&', so the former "%2&4/4" evaluated as "%2&1",
; which is 0 for every height used here and always requested 5 registers.
cglobal pixel_sad_%1x%2, 4,5-((%2&4)/4)
    pxor    m0, m0                  ; m0 = running per-lane sums
%if %2 == 4
    ; four rows: two unrolled steps, no counter needed
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3              ; number of SAD_INC steps
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
%if %1*%2 == 256
    ; 16x16 only: unsigned horizontal add (presumably because the lane sums
    ; can exceed the signed 16-bit range for this block size)
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro
|
||||
|
||||
INIT_MMX mmx2
|
||||
SAD_MMX 16, 16, 1
|
||||
SAD_MMX 16, 8, 1
|
||||
SAD_MMX 8, 16, 2
|
||||
SAD_MMX 8, 8, 2
|
||||
SAD_MMX 8, 4, 2
|
||||
SAD_MMX 4, 8, 2
|
||||
SAD_MMX 4, 4, 2
|
||||
INIT_MMX ssse3
|
||||
SAD_MMX 4, 8, 2
|
||||
SAD_MMX 4, 4, 2
|
||||
|
||||
;=============================================================================
|
||||
; SAD XMM
|
||||
;=============================================================================
|
||||
|
||||
; SAD_INC_2ROW width
; Adds to m0 the word-wise absolute differences of two rows between [r2]
; (stride r3) and [r0] (stride r1), then advances both pointers by two rows
; (4*stride bytes). When a row of %1 words does not fit in one vector
; (2*%1 > mmsize) two vectors per row are processed, otherwise one.
%macro SAD_INC_2ROW 1
%if 2*%1 > mmsize
    ; row 0, both halves
    movu    m1, [r2+ 0]
    psubw   m1, [r0+ 0]
    movu    m2, [r2+16]
    psubw   m2, [r0+16]
    ; row 1, both halves
    movu    m3, [r2+2*r3+ 0]
    psubw   m3, [r0+2*r1+ 0]
    movu    m4, [r2+2*r3+16]
    psubw   m4, [r0+2*r1+16]
    ABSW2   m1, m2, m1, m2, m5, m6
    ABSW2   m3, m4, m3, m4, m7, m5
    paddw   m1, m2
    paddw   m3, m4
    paddw   m0, m1
    paddw   m0, m3
%else
    movu    m1, [r2]
    psubw   m1, [r0]
    movu    m2, [r2+2*r3]
    psubw   m2, [r0+2*r1]
    ABSW2   m1, m2, m1, m2, m3, m4
    paddw   m0, m1
    paddw   m0, m2
%endif
    ; common to both layouts: skip the two rows just consumed
    lea     r0, [r0+4*r1]
    lea     r2, [r2+4*r3]
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
|
||||
;-----------------------------------------------------------------------------
|
||||
; SAD width, height
;   %1 = block width, %2 = block height.
; Emits pixel_sad_%1x%2 for the current INIT_* vector size; rows are consumed
; two at a time by SAD_INC_2ROW and the sum is returned in eax.
%macro SAD 2
; r4 only serves as the loop counter and the height-4 case is unrolled, so one
; GPR less is requested there. The parentheses around "%2&4" are required:
; NASM evaluates '/' before '&', so the former "%2&4/4" meant "%2&1" and
; always asked for 5 registers.
cglobal pixel_sad_%1x%2, 4,5-((%2&4)/4),8*(%1/mmsize)
    pxor    m0, m0                  ; m0 = running per-lane sums
%if %2 == 4
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2               ; two rows per iteration
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif
    HADDW   m0, m1
    movd    eax, xm0
    RET
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
SAD 16, 16
|
||||
SAD 16, 8
|
||||
SAD 8, 16
|
||||
SAD 8, 8
|
||||
SAD 8, 4
|
||||
INIT_XMM sse2, aligned
|
||||
SAD 16, 16
|
||||
SAD 16, 8
|
||||
SAD 8, 16
|
||||
SAD 8, 8
|
||||
INIT_XMM ssse3
|
||||
SAD 16, 16
|
||||
SAD 16, 8
|
||||
SAD 8, 16
|
||||
SAD 8, 8
|
||||
SAD 8, 4
|
||||
INIT_XMM ssse3, aligned
|
||||
SAD 16, 16
|
||||
SAD 16, 8
|
||||
SAD 8, 16
|
||||
SAD 8, 8
|
||||
INIT_YMM avx2
|
||||
SAD 16, 16
|
||||
SAD 16, 8
|
||||
|
||||
;=============================================================================
|
||||
; SAD x3/x4
|
||||
;=============================================================================
|
||||
|
||||
; Advance the x3 pointers: the three candidate pointers r1-r3 move by 4*r4
; bytes and the encode pointer r0 by 4*FENC_STRIDE bytes.
%macro SAD_X3_INC_P 0
    lea     r1, [r1+4*r4]
    lea     r2, [r2+4*r4]
    lea     r3, [r3+4*r4]
    add     r0, 4*FENC_STRIDE
%endmacro
|
||||
|
||||
; Initialise the three accumulators: m0/m1/m2 = |[r1]/[r2]/[r3] - [r0]|
; word-wise for the first vector. m3 holds the [r0] vector; m4-m6 are passed
; to ABSW2/ABSW as temporaries.
%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    movu    m0, [r1]
    psubw   m0, m3
    movu    m1, [r2]
    psubw   m1, m3
    movu    m2, [r3]
    psubw   m2, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    ABSW    m2, m2, m6
%endmacro
|
||||
|
||||
; SAD_X3_ONE fenc_offset, pix_offset
; Adds |[rN+%2] - [r0+%1]| (word-wise, N = 1..3) to accumulators m0..m2.
; m6 first holds the [r0+%1] vector and is then reused as an ABSW temporary,
; so all three subtractions must precede the first ABSW2.
%macro SAD_X3_ONE 2
    mova    m6, [r0+%1]
    movu    m3, [r1+%2]
    psubw   m3, m6
    movu    m4, [r2+%2]
    psubw   m4, m6
    movu    m5, [r3+%2]
    psubw   m5, m6
    ABSW2   m3, m4, m3, m4, m7, m6
    paddw   m0, m3
    paddw   m1, m4
    ABSW    m5, m5, m6
    paddw   m2, m5
%endmacro
|
||||
|
||||
; SAD_X3_END width, height
; Horizontally reduces accumulators m0..m2 and stores the three 32-bit sums
; at offsets 0/4/8 of the scores pointer (argument 5), then returns.
%macro SAD_X3_END 2
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m3
    HADDUW  m1, m4
    HADDUW  m2, m5
%else
    HADDW   m0, m3
    HADDW   m1, m4
    HADDW   m2, m5
%endif
    ; pick the register that holds the scores pointer
%if UNIX64
    %xdefine %%scores r5            ; use r5 directly
%else
    mov     r0, r5mp                ; reload the argument from its stack slot
    %xdefine %%scores r0
%endif
    movd    [%%scores+0], xm0
    movd    [%%scores+4], xm1
    movd    [%%scores+8], xm2
    RET
%endmacro
|
||||
|
||||
; Advance the x4 pointers: the four candidate pointers r1-r4 move by 4*r5
; bytes and the encode pointer r0 by 4*FENC_STRIDE bytes.
%macro SAD_X4_INC_P 0
    lea     r1, [r1+4*r5]
    lea     r2, [r2+4*r5]
    lea     r3, [r3+4*r5]
    lea     r4, [r4+4*r5]
    add     r0, 4*FENC_STRIDE
%endmacro
|
||||
|
||||
; Initialise the four accumulators: m0..m3 = |[r1]..[r4] - [r0]| word-wise
; for the first vector. m4 holds the [r0] vector and is reused as a temporary
; by the second ABSW2, so every subtraction comes before the ABSW2 pair.
%macro SAD_X4_ONE_START 0
    mova    m4, [r0]
    movu    m0, [r1]
    psubw   m0, m4
    movu    m1, [r2]
    psubw   m1, m4
    movu    m2, [r3]
    psubw   m2, m4
    movu    m3, [r4]
    psubw   m3, m4
    ABSW2   m0, m1, m0, m1, m5, m6
    ABSW2   m2, m3, m2, m3, m4, m7
%endmacro
|
||||
|
||||
; SAD_X4_ONE fenc_offset, pix_offset
; Adds |[rN+%2] - [r0+%1]| (word-wise, N = 1..4) to accumulators m0..m3.
; Three register-allocation variants:
;   * more than 8 vector registers: all four differences kept live at once;
;   * ssse3 with 8 registers: m4 is recycled for the fourth candidate, whose
;     difference is taken against memory instead;
;   * otherwise: two candidates at a time, with m7 (and finally m4) as the
;     ABSW temporaries.
%macro SAD_X4_ONE 2
    mova    m4, [r0+%1]
    movu    m5, [r1+%2]
    movu    m6, [r2+%2]
%if num_mmregs > 8
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m9, m10
    paddw   m0, m5
    paddw   m1, m6
    movu    m7, [r3+%2]
    movu    m8, [r4+%2]
    psubw   m7, m4
    psubw   m8, m4
    ABSW2   m7, m8, m7, m8, m9, m10
    paddw   m2, m7
    paddw   m3, m8
%elif cpuflag(ssse3)
    psubw   m5, m4
    pabsw   m5, m5
    paddw   m0, m5
    psubw   m6, m4
    pabsw   m6, m6
    paddw   m1, m6
    movu    m7, [r3+%2]
    psubw   m7, m4
    pabsw   m7, m7
    paddw   m2, m7
    ; m4 is free from here on: reuse it for the fourth candidate
    movu    m4, [r4+%2]
    psubw   m4, [r0+%1]
    pabsw   m4, m4
    paddw   m3, m4
%else ; num_mmregs == 8 && !ssse3
    psubw   m5, m4
    ABSW    m5, m5, m7
    paddw   m0, m5
    psubw   m6, m4
    ABSW    m6, m6, m7
    paddw   m1, m6
    movu    m5, [r3+%2]
    movu    m6, [r4+%2]
    psubw   m5, m4
    psubw   m6, m4
    ABSW2   m5, m6, m5, m6, m7, m4  ; m4 no longer needed, used as temporary
    paddw   m2, m5
    paddw   m3, m6
%endif
%endmacro
|
||||
|
||||
; SAD_X4_END width, height
; Horizontally reduces accumulators m0..m3 and stores the four 32-bit sums
; at offsets 0/4/8/12 of the scores pointer (argument 6, fetched from its
; stack slot into r0), then returns.
%macro SAD_X4_END 2
    mov     r0, r6mp                ; r0 = scores pointer
%if mmsize == 8 && %1*%2 == 256
    HADDUW  m0, m4
    HADDUW  m1, m5
    HADDUW  m2, m6
    HADDUW  m3, m7
%else
    HADDW   m0, m4
    HADDW   m1, m5
    HADDW   m2, m6
    HADDW   m3, m7
%endif
    movd    [r0+ 0], xm0
    movd    [r0+ 4], xm1
    movd    [r0+ 8], xm2
    movd    [r0+12], xm3
    RET
%endmacro
|
||||
|
||||
; SAD_X_2xNP n, stride, first, count
;   %1 = 3 or 4, selects SAD_X3_ONE / SAD_X4_ONE
;   %2 = stride operand used to address the second row of the candidates
;   %3 = index of the first mmsize-wide column to process
;   %4 = number of columns to process (may be 0, in which case nothing is
;        emitted)
; For each column x in [%3, %3+%4) this accumulates two rows: the row at
; byte offset x*mmsize, and the next row (2*FENC_STRIDE further in the encode
; block, 2*%2 further in the candidates).
; Note that the column counter "x" is a plain %assign and is therefore
; visible (and left modified) outside this macro.
%macro SAD_X_2xNP 4
|
||||
%assign x %3
|
||||
%rep %4
|
||||
SAD_X%1_ONE x*mmsize, x*mmsize
|
||||
SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
|
||||
%assign x x+1
|
||||
%endrep
|
||||
%endmacro
|
||||
|
||||
; Emits pixel_vsad( r0 = pixels, r1 = stride, r2d = row count ).
; Sums the word-wise absolute differences between each row and the row below
; it, over two 16-byte vectors per row, and returns the total in eax.
; m2/m3 always carry the most recently loaded "lower" row into the next
; iteration so that each row is read only once.
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
    ; first row pair
    mova    m0, [r0]
    mova    m1, [r0+16]
    mova    m2, [r0+2*r1]
    mova    m3, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    psubw   m0, m2
    psubw   m1, m3
    ABSW2   m0, m1, m0, m1, m4, m5
    paddw   m0, m1
    sub     r2d, 2
    je .end
.loop:
    ; two new rows: m4/m5 = upper, m6/m7 = lower
    mova    m4, [r0]
    mova    m5, [r0+16]
    mova    m6, [r0+2*r1]
    mova    m7, [r0+2*r1+16]
    lea     r0, [r0+4*r1]
    ; previous lower row vs. new upper row
    psubw   m2, m4
    ABSW    m2, m2, m1
    paddw   m0, m2
    psubw   m3, m5
    ABSW    m3, m3, m1
    paddw   m0, m3
    ; new upper row vs. new lower row
    psubw   m4, m6
    ABSW    m4, m4, m1
    paddw   m0, m4
    psubw   m5, m7
    ABSW    m5, m5, m1
    paddw   m0, m5
    ; the lower row becomes the reference for the next iteration
    mova    m2, m6
    mova    m3, m7
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
    HADDUW  m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
    movd    eax, m0
    RET
%endmacro
|
||||
INIT_XMM sse2
|
||||
PIXEL_VSAD
|
||||
INIT_XMM ssse3
|
||||
PIXEL_VSAD
|
||||
INIT_XMM xop
|
||||
PIXEL_VSAD
|
||||
|
||||
INIT_YMM avx2
|
||||
; pixel_vsad( r0 = pixels, r1 = stride, r2d = row count ), one vector per row.
; Sums the word-wise absolute differences between each row and the row below
; it and returns the total in eax. m1 carries the last loaded row into the
; next iteration.
cglobal pixel_vsad, 3,3
    mova    m0, [r0]
    mova    m1, [r0+2*r1]
    lea     r0, [r0+4*r1]
    psubw   m0, m1
    pabsw   m0, m0
    sub     r2d, 2
    je .end
.loop:
    mova    m2, [r0]
    mova    m3, [r0+2*r1]
    lea     r0, [r0+4*r1]
    ; previous row vs. first new row
    psubw   m1, m2
    pabsw   m1, m1
    paddw   m0, m1
    ; first new row vs. second new row
    psubw   m2, m3
    pabsw   m2, m2
    paddw   m0, m2
    mova    m1, m3
    sub     r2d, 2
    jg .loop
.end:
%if BIT_DEPTH == 9
    HADDW   m0, m1
%else
    HADDUW  m0, m1
%endif
    movd    eax, xm0
    RET
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
|
||||
; uint16_t *pix2, intptr_t i_stride, int scores[3] )
|
||||
;-----------------------------------------------------------------------------
|
||||
; SAD_X n, width, height
;   %1 = number of candidate blocks (3 or 4), %2 = width, %3 = height.
; Emits pixel_sad_x%1_%2x%3. The stride is the argument following the
; candidate pointers, i.e. r4 for x3 and r5 for x4 (STRIDE = r<%1+1>).
; r6 counts the remaining row pairs. XMM_REGS must be %define'd by the
; instantiation site before this macro is used.
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
    %assign regnum %1+1
    %xdefine STRIDE r %+ regnum
    mov     r6, %3/2-1              ; row pairs left after the first one
    ; first row pair: column 0 initialises the accumulators, the remaining
    ; columns (if any) are added on top
    SAD_X%1_ONE_START
    SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
    SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
    SAD_X%1_INC_P
    SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
    dec     r6
    jg .loop
    ; No scores-pointer load is needed here: SAD_X3_END and SAD_X4_END fetch
    ; it themselves. The former "mov r6, r6m" for the x4 case was a dead load
    ; (SAD_X4_END reads r6mp into r0 and never looks at r6).
    SAD_X%1_END %2, %3
%endmacro
|
||||
|
||||
INIT_MMX mmx2
|
||||
%define XMM_REGS 0
|
||||
SAD_X 3, 16, 16
|
||||
SAD_X 3, 16, 8
|
||||
SAD_X 3, 8, 16
|
||||
SAD_X 3, 8, 8
|
||||
SAD_X 3, 8, 4
|
||||
SAD_X 3, 4, 8
|
||||
SAD_X 3, 4, 4
|
||||
SAD_X 4, 16, 16
|
||||
SAD_X 4, 16, 8
|
||||
SAD_X 4, 8, 16
|
||||
SAD_X 4, 8, 8
|
||||
SAD_X 4, 8, 4
|
||||
SAD_X 4, 4, 8
|
||||
SAD_X 4, 4, 4
|
||||
INIT_MMX ssse3
|
||||
SAD_X 3, 4, 8
|
||||
SAD_X 3, 4, 4
|
||||
SAD_X 4, 4, 8
|
||||
SAD_X 4, 4, 4
|
||||
INIT_XMM ssse3
|
||||
%define XMM_REGS 7
|
||||
SAD_X 3, 16, 16
|
||||
SAD_X 3, 16, 8
|
||||
SAD_X 3, 8, 16
|
||||
SAD_X 3, 8, 8
|
||||
SAD_X 3, 8, 4
|
||||
%define XMM_REGS 9
|
||||
SAD_X 4, 16, 16
|
||||
SAD_X 4, 16, 8
|
||||
SAD_X 4, 8, 16
|
||||
SAD_X 4, 8, 8
|
||||
SAD_X 4, 8, 4
|
||||
INIT_XMM sse2
|
||||
%define XMM_REGS 8
|
||||
SAD_X 3, 16, 16
|
||||
SAD_X 3, 16, 8
|
||||
SAD_X 3, 8, 16
|
||||
SAD_X 3, 8, 8
|
||||
SAD_X 3, 8, 4
|
||||
%define XMM_REGS 11
|
||||
SAD_X 4, 16, 16
|
||||
SAD_X 4, 16, 8
|
||||
SAD_X 4, 8, 16
|
||||
SAD_X 4, 8, 8
|
||||
SAD_X 4, 8, 4
|
||||
INIT_XMM xop
|
||||
%define XMM_REGS 7
|
||||
SAD_X 3, 16, 16
|
||||
SAD_X 3, 16, 8
|
||||
SAD_X 3, 8, 16
|
||||
SAD_X 3, 8, 8
|
||||
SAD_X 3, 8, 4
|
||||
%define XMM_REGS 9
|
||||
SAD_X 4, 16, 16
|
||||
SAD_X 4, 16, 8
|
||||
SAD_X 4, 8, 16
|
||||
SAD_X 4, 8, 8
|
||||
SAD_X 4, 8, 4
|
||||
INIT_YMM avx2
|
||||
%define XMM_REGS 7
|
||||
SAD_X 3, 16, 16
|
||||
SAD_X 3, 16, 8
|
||||
%define XMM_REGS 9
|
||||
SAD_X 4, 16, 16
|
||||
SAD_X 4, 16, 8
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
; Emits intra_sad_x3_4x4( r0 = fenc, r1 = fdec, r2 = res ).
; Register roles, as far as the code below shows:
;   m0 = row above the block ([r1-FDEC_STRIDEB]) duplicated into both halves,
;        later the V cost
;   m1 = fenc rows 0 and 1, m2 = fenc rows 2 and 3 (later the H differences)
;   m3/m4 = left-neighbour pixels broadcast per row
;   m5/m6 = DC value and DC differences
; Results are written as three 32-bit values in the order V, H, DC; the ssse3
; path stores a full 16-byte vector at [r2].
%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
%if cpuflag(ssse3)
    movddup m0, [r1-1*FDEC_STRIDEB]
%else
    movq    m0, [r1-1*FDEC_STRIDEB]
    punpcklqdq m0, m0
%endif
    ; fenc rows, two rows per register
    movq    m1, [r0+0*FENC_STRIDEB]
    movhps  m1, [r0+1*FENC_STRIDEB]
    movq    m2, [r0+2*FENC_STRIDEB]
    movhps  m2, [r0+3*FENC_STRIDEB]
    ; horizontal sum of the top row, needed for DC (must read m0 before it is
    ; overwritten by the V differences below)
    pshuflw m6, m0, q1032
    paddw   m6, m0
    pshuflw m5, m6, q2301
    paddw   m6, m5
    punpcklqdq m6, m6 ; A+B+C+D 8 times
    ; V: |fenc - top|
    psubw   m3, m1, m0
    psubw   m0, m2
    ABSW2   m3, m0, m3, m0, m4, m5
    paddw   m0, m3
    ; left neighbours, broadcast across each row
    movd    m3, [r1+0*FDEC_STRIDEB-4]
    movhps  m3, [r1+1*FDEC_STRIDEB-8]
    pshufhw m3, m3, q3333
    pshuflw m3, m3, q1111 ; FF FF EE EE
    movd    m4, [r1+2*FDEC_STRIDEB-4]
    movhps  m4, [r1+3*FDEC_STRIDEB-8]
    pshufhw m4, m4, q3333
    pshuflw m4, m4, q1111 ; HH HH GG GG
    ; DC = (top sum + left sum + 4) >> 3
    paddw   m5, m3, m4
    paddw   m6, [pw_4]
    paddw   m6, m5
    pshufd  m5, m5, q1032
    paddw   m5, m6
    psrlw   m5, 3
    ; DC and H differences (m3/m4 are consumed here, then reused as scratch)
    psubw   m6, m5, m2
    psubw   m5, m1
    psubw   m1, m3
    psubw   m2, m4
    ABSW2   m5, m6, m5, m6, m3, m4
    ABSW2   m1, m2, m1, m2, m3, m4
    paddw   m5, m6
    paddw   m1, m2
%if cpuflag(ssse3)
    phaddw  m0, m1
    movhlps m3, m5
    paddw   m5, m3
    phaddw  m0, m5
    pmaddwd m0, [pw_1]
    mova    [r2], m0
%else
    HADDW   m0, m3
    movd    [r2], m0 ; V prediction cost
    HADDW   m1, m3
    movd    [r2+4], m1 ; H prediction cost
    HADDW   m5, m3
    movd    [r2+8], m5 ; DC prediction cost
%endif
    RET
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
INTRA_SAD_X3_4x4
|
||||
INIT_XMM ssse3
|
||||
INTRA_SAD_X3_4x4
|
||||
INIT_XMM avx
|
||||
INTRA_SAD_X3_4x4
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
;m0 = DC
|
||||
;m6 = V
|
||||
;m7 = H
|
||||
;m1 = DC score
|
||||
;m2 = V score
|
||||
;m3 = H score
|
||||
;m5 = temp
|
||||
;m4 = pixel row
|
||||
|
||||
; INTRA_SAD_HVDC_ITER row, shuffle
;   %1 = row index (also passed to ACCUM as its fourth argument)
;   %2 = pshufd selector that broadcasts this row's entry of m7 (H)
; Processes fenc row %1 (r0 is biased by 4 rows, hence the %1-4) against the
; three predictors and accumulates into the score registers listed above:
; m1 (DC), m2 (V), m3 (H). m4/m5 are scratch; the three parts are independent.
%macro INTRA_SAD_HVDC_ITER 2
    ; H score
    pshufd  m5, m7, %2
    psubw   m5, [r0+(%1-4)*FENC_STRIDEB]
    ABSW    m5, m5, m4
    ACCUM paddw, 3, 5, %1
    ; V score
    mova    m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw   m4, m6
    ABSW    m4, m4, m5
    ACCUM paddw, 2, 4, %1
    ; DC score
    mova    m4, [r0+(%1-4)*FENC_STRIDEB]
    psubw   m4, m0
    ABSW    m4, m4, m5
    ACCUM paddw, 1, 4, %1
%endmacro
|
||||
|
||||
; Emits intra_sad_x3_8x8( r0 = fenc, r1 = edge, r2 = res ).
; Builds the DC value from the vectors at edge+7 and edge+16 (in pixels),
; runs INTRA_SAD_HVDC_ITER over the eight rows, and stores three 32-bit
; scores in the order m2, m3, m1 (V, H, DC per the register legend above).
; The ssse3 path writes a full 16-byte vector at [r2].
%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
    movu    m0, [r1+7*SIZEOF_PIXEL]
    mova    m6, [r1+16*SIZEOF_PIXEL] ;V prediction
    ; H source for rows 0-3: upper four words of the edge+7 vector, each doubled
    mova    m7, m0
    punpckhwd m7, m7
    ; DC = (sum of both edge vectors + 8) >> 4, broadcast to all lanes
    paddw   m0, m6
    HADDW   m0, m4
    paddw   m0, [pw_8]
    psrlw   m0, 4
    SPLATW  m0, m0
    add     r0, 4*FENC_STRIDEB      ; bias r0 so rows are addressed as %1-4
    INTRA_SAD_HVDC_ITER 0, q3333
    INTRA_SAD_HVDC_ITER 1, q2222
    INTRA_SAD_HVDC_ITER 2, q1111
    INTRA_SAD_HVDC_ITER 3, q0000
    ; H source for rows 4-7: lower four words, each doubled
    movq    m7, [r1+7*SIZEOF_PIXEL]
    punpcklwd m7, m7
    INTRA_SAD_HVDC_ITER 4, q3333
    INTRA_SAD_HVDC_ITER 5, q2222
    INTRA_SAD_HVDC_ITER 6, q1111
    INTRA_SAD_HVDC_ITER 7, q0000
%if cpuflag(ssse3)
    phaddw  m2, m3     ; 2 2 2 2 3 3 3 3
    movhlps m3, m1
    paddw   m1, m3     ; 1 1 1 1 _ _ _ _
    phaddw  m2, m1     ; 2 2 3 3 1 1 _ _
    pmaddwd m2, [pw_1] ; 2 3 1 _
    mova    [r2], m2
%else
    HADDW   m2, m4
    movd    [r2+0], m2
    HADDW   m3, m4
    movd    [r2+4], m3
    HADDW   m1, m4
    movd    [r2+8], m1
%endif
    RET
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
INTRA_SAD_X3_8x8
|
||||
INIT_XMM ssse3
|
||||
INTRA_SAD_X3_8x8
|
||||
|
||||
; INTRA_SAD_HVDC_ITER_YMM row, shuffle
; Handles two fenc rows at once: row %1 (low lane) and row %1+4 (high lane);
; r0 is biased by 4 rows. Accumulates into m2 (H), m1 (V) and m3 (DC) using
; m7 (H source, broadcast per lane with selector %2), m6 (V) and m0 (DC).
; Note that the score register numbering differs from the SSE version.
%macro INTRA_SAD_HVDC_ITER_YMM 2
    mova    xm4, [r0+(%1-4)*FENC_STRIDEB]
    vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
    ; H
    pshufd  m5, m7, %2
    psubw   m5, m4
    pabsw   m5, m5
    ACCUM paddw, 2, 5, %1 ; H
    ; V
    psubw   m5, m4, m6
    pabsw   m5, m5
    ACCUM paddw, 1, 5, %1 ; V
    ; DC (consumes m4)
    psubw   m4, m0
    pabsw   m4, m4
    ACCUM paddw, 3, 4, %1 ; DC
%endmacro
|
||||
|
||||
INIT_YMM avx2
|
||||
; intra_sad_x3_8x8( r0 = fenc, r1 = edge, r2 = res ), 256-bit version.
; Each INTRA_SAD_HVDC_ITER_YMM call covers rows n and n+4, so four calls
; handle the whole block. A full 16-byte vector holding the V, H and DC
; scores (in that order) is stored at [r2].
cglobal intra_sad_x3_8x8, 3,3,8
    movu    xm0, [r1+7*SIZEOF_PIXEL]
    vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
    ; H source: swap the two halves of the edge+7 vector into separate lanes
    ; and double every word (must read m0 before the DC sum overwrites it)
    vpermq  m7, m0, q0011
    punpcklwd m7, m7
    ; DC value, broadcast to every word
    paddw   xm0, xm6
    paddw   xm0, [pw_1] ; equal to +8 after HADDW
    HADDW   xm0, xm4
    psrld   xm0, 4
    vpbroadcastw m0, xm0
    add     r0, 4*FENC_STRIDEB      ; bias r0 so rows are addressed as %1-4
    INTRA_SAD_HVDC_ITER_YMM 0, q3333
    INTRA_SAD_HVDC_ITER_YMM 1, q2222
    INTRA_SAD_HVDC_ITER_YMM 2, q1111
    INTRA_SAD_HVDC_ITER_YMM 3, q0000
    phaddw  m1, m2     ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
    punpckhqdq m2, m3, m3
    paddw   m3, m2     ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
    phaddw  m1, m3     ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
    vextracti128 xm2, m1, 1
    paddw   xm1, xm2   ; 1 1 2 2 3 3 _ _
    pmaddwd xm1, [pw_1] ; 1 2 3 _
    mova    [r2], xm1
    RET
|
||||
881
common/x86/trellis-64.asm
Normal file
881
common/x86/trellis-64.asm
Normal file
@@ -0,0 +1,881 @@
|
||||
;*****************************************************************************
|
||||
;* trellis-64.asm: x86_64 trellis quantization
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2012-2025 x264 project
|
||||
;*
|
||||
;* Authors: Loren Merritt <lorenm@u.washington.edu>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
; This is a pretty straight-forward translation of the C code, except:
|
||||
; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
|
||||
; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
|
||||
; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those
|
||||
; nodes are invalid).
|
||||
; * Interprocedural register allocation. Eliminates argument-passing overhead
|
||||
; to trellis_coef* subroutines. Also reduces codesize.
|
||||
|
||||
; Optimizations that I tried, and rejected because they were not faster:
|
||||
; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
|
||||
; Costs too much icache compared to the negligible speedup.
|
||||
; * There are only 21 possible sets of live node_ctxs; we could keep track of
|
||||
; exactly which set we're in and feed that (along with abs_level) into a jump
|
||||
; table instead of the switch to select a trellis_coef subroutine. This would
|
||||
; eliminate all branches about which node_ctxs are live, but costs either a
|
||||
; bunch of icache or a bunch of call/ret, and the jump table itself is
|
||||
; unpredictable.
|
||||
; * Separate versions of trellis_coef* depending on whether we're doing the 1st
|
||||
; or the 2nd of the two abs_level candidates. This would eliminate some
|
||||
; branches about if(score is better).
|
||||
; * Special case more values of coef. I had a coef2 at some intermediate point
|
||||
; in the optimization process, but it didn't end up worthwhile in conjunction
|
||||
; with all the other optimizations.
|
||||
; * Unroll or simd writeback. I don't know why this didn't help.
|
||||
|
||||
%include "x86inc.asm"
|
||||
%include "x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
pd_m16: times 4 dd -16
|
||||
sq_1: dq 1, 0
|
||||
pq_128: times 2 dq 128
|
||||
pq_ffffffff: times 2 dq 0xffffffff
|
||||
|
||||
cextern pd_8
|
||||
cextern pd_0123
|
||||
cextern pd_4567
|
||||
cextern_common cabac_entropy
|
||||
cextern_common cabac_transition
|
||||
cextern cabac_size_unary
|
||||
cextern cabac_transition_unary
|
||||
cextern_common dct4_weight_tab
|
||||
cextern_common dct8_weight_tab
|
||||
cextern_common dct4_weight2_tab
|
||||
cextern_common dct8_weight2_tab
|
||||
cextern_common last_coeff_flag_offset_8x8
|
||||
cextern_common significant_coeff_flag_offset_8x8
|
||||
cextern_common coeff_flag_offset_chroma_422_dc
|
||||
|
||||
SECTION .text
|
||||
|
||||
%define TRELLIS_SCORE_BIAS 1<<60
|
||||
%define SIZEOF_NODE 16
|
||||
%define CABAC_SIZE_BITS 8
|
||||
%define LAMBDA_BITS 4
|
||||
|
||||
; SQUARE dst, tmp
; Replaces the low dword of each qword of m<dst> with its square (pmuludq of
; the register with itself). Depending on the build:
;   * ssse3: the absolute value is taken first with pabsd;
;   * HIGH_BIT_DEPTH without ssse3: ABSD writes into m<tmp> and the two
;     register names are swapped;
;   * otherwise: the signed value is multiplied directly and the product is
;     masked with pq_ffffffff.
%macro SQUARE 2 ; dst, tmp
    ; could use pmuldq here, to eliminate the abs. but that would involve
    ; templating a sse4 version of all of trellis, for negligible speedup.
%if cpuflag(ssse3)
    pabsd   m%1, m%1
%elif HIGH_BIT_DEPTH
    ABSD    m%2, m%1
    SWAP    %1, %2
%endif
    pmuludq m%1, m%1
%if cpuflag(ssse3) == 0 && HIGH_BIT_DEPTH == 0
    pand    m%1, [pq_ffffffff]
%endif
%endmacro
|
||||
|
||||
; LOAD_DUP dst, src
; Loads from src and duplicates the low qword of dst into its high qword.
; Without ssse3 this is a 32-bit movd followed by punpcklqdq; with ssse3 a
; single movddup is used, which reads 64 bits from src.
%macro LOAD_DUP 2 ; dst, src
%if cpuflag(ssse3) == 0
    movd    %1, %2
    punpcklqdq %1, %1
%else
    movddup %1, %2
%endif
%endmacro
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; int trellis_cabac_4x4_psy(
|
||||
; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
|
||||
; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
|
||||
; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
|
||||
; uint64_t level_state0, uint16_t level_state1,
|
||||
; int b_ac, dctcoef *fenc_dct, int psy_trellis )
|
||||
;-----------------------------------------------------------------------------
|
||||
%macro TRELLIS 4
|
||||
%define num_coefs %2
|
||||
%define dc %3
|
||||
%define psy %4
|
||||
cglobal %1, 4,15,9
|
||||
%assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
|
||||
%assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
|
||||
SUB rsp, pad
|
||||
DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
|
||||
%if WIN64
|
||||
%define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
|
||||
%else
|
||||
%define level_statem rsp+stack_offset+32
|
||||
%endif
|
||||
%define b_acm r11m ; 4x4 only
|
||||
%define b_interlacedm r11m ; 8x8 only
|
||||
%define i_coefsm1 r11m ; dc only
|
||||
%define fenc_dctm r12m
|
||||
%define psy_trellism r13m
|
||||
%if num_coefs == 64
|
||||
shl dword b_interlacedm, 6
|
||||
%define dct_weight1_tab dct8_weight_tab
|
||||
%define dct_weight2_tab dct8_weight2_tab
|
||||
%else
|
||||
%define dct_weight1_tab dct4_weight_tab
|
||||
%define dct_weight2_tab dct4_weight2_tab
|
||||
%endif
|
||||
|
||||
%define stack rsp
|
||||
%define last_nnzm [stack+0]
|
||||
%define zigzagm [stack+8]
|
||||
mov last_nnzm, iid
|
||||
mov zigzagm, zigzagq
|
||||
%if WIN64 == 0
|
||||
%define orig_coefsm [stack+16]
|
||||
%define quant_coefsm [stack+24]
|
||||
mov orig_coefsm, orig_coefsq
|
||||
mov quant_coefsm, quant_coefsq
|
||||
%endif
|
||||
%define unquant_mfm [stack+32]
|
||||
%define levelgt1_ctxm [stack+40]
|
||||
%define ssd stack+48
|
||||
%define cost_siglast stack+80
|
||||
%define level_tree stack+96
|
||||
|
||||
; trellis_node_t is laid out differently than C.
|
||||
; struct-of-arrays rather than array-of-structs, for simd.
|
||||
%define nodes_curq r7
|
||||
%define nodes_prevq r8
|
||||
%define node_score(x) x*8
|
||||
%define node_level_idx(x) 64+x*4
|
||||
%define node_cabac_state(x) 96+x*4
|
||||
lea nodes_curq, [level_tree + level_tree_size]
|
||||
lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
|
||||
mov r6, TRELLIS_SCORE_BIAS
|
||||
mov [nodes_curq + node_score(0)], r6
|
||||
mov dword [nodes_curq + node_level_idx(0)], 0
|
||||
movd mm0, [level_statem + 0]
|
||||
punpcklbw mm0, [level_statem + 4]
|
||||
punpcklwd mm0, [level_statem + 8]
|
||||
%define level_state_packed mm0 ; version for copying into node.cabac_state
|
||||
pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
|
||||
movq [nodes_curq + node_score(1)], m7
|
||||
mova [nodes_curq + node_score(2)], m7
|
||||
|
||||
%define levels_usedq r4
|
||||
%define levels_usedd r4d
|
||||
mov dword [level_tree], 0
|
||||
mov levels_usedd, 1
|
||||
|
||||
%define abs_levelq r9
|
||||
%define abs_leveld r9d
|
||||
%define abs_coefq r14
|
||||
%define zigzagiq r5
|
||||
%define zigzagid r5d
|
||||
|
||||
%if num_coefs == 8
|
||||
mov dword levelgt1_ctxm, 8
|
||||
%else
|
||||
mov dword levelgt1_ctxm, 9
|
||||
%endif
|
||||
%if psy
|
||||
LOAD_DUP m6, psy_trellism
|
||||
%define psy_trellis m6
|
||||
%elif dc
|
||||
LOAD_DUP m6, [unquant_mfq]
|
||||
paddd m6, m6
|
||||
%define unquant_mf m6
|
||||
%endif
|
||||
%if dc == 0
|
||||
mov unquant_mfm, unquant_mfq
|
||||
%endif
|
||||
; Keep a single offset register to PICify all global constants.
|
||||
; They're all relative to "beginning of this asm file's .text section",
|
||||
; even tables that aren't in this file.
|
||||
; (Any address in .text would work, this one was just convenient.)
|
||||
lea r0, [$$]
|
||||
%define GLOBAL +r0-$$
|
||||
|
||||
TRELLIS_LOOP 0 ; node_ctx 0..3
|
||||
TRELLIS_LOOP 1 ; node_ctx 1..7
|
||||
|
||||
.writeback:
|
||||
; int level = bnode->level_idx;
|
||||
; for( int i = b_ac; i <= last_nnz; i++ )
|
||||
; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
|
||||
; level = level_tree[level].next;
|
||||
mov iid, last_nnzm
|
||||
add zigzagq, iiq
|
||||
neg iiq
|
||||
%if num_coefs == 16 && dc == 0
|
||||
mov r2d, b_acm
|
||||
add iiq, r2
|
||||
%endif
|
||||
%define dctq r10
|
||||
mov r0d, [nodes_curq + node_level_idx(0) + rax*4]
|
||||
.writeback_loop:
|
||||
movzx r2, byte [zigzagq + iiq]
|
||||
%if cpuflag(ssse3)
|
||||
movd m0, [level_tree + r0*4]
|
||||
movzx r0, word [level_tree + r0*4]
|
||||
psrld m0, 16
|
||||
movd m1, [dctq + r2*SIZEOF_DCTCOEF]
|
||||
%if HIGH_BIT_DEPTH
|
||||
psignd m0, m1
|
||||
movd [dctq + r2*SIZEOF_DCTCOEF], m0
|
||||
%else
|
||||
psignw m0, m1
|
||||
movd r4d, m0
|
||||
mov [dctq + r2*SIZEOF_DCTCOEF], r4w
|
||||
%endif
|
||||
%else
|
||||
mov r5d, [level_tree + r0*4]
|
||||
%if HIGH_BIT_DEPTH
|
||||
mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
|
||||
%else
|
||||
movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
|
||||
%endif
|
||||
movzx r0d, r5w
|
||||
sar r4d, 31
|
||||
shr r5d, 16
|
||||
xor r5d, r4d
|
||||
sub r5d, r4d
|
||||
%if HIGH_BIT_DEPTH
|
||||
mov [dctq + r2*SIZEOF_DCTCOEF], r5d
|
||||
%else
|
||||
mov [dctq + r2*SIZEOF_DCTCOEF], r5w
|
||||
%endif
|
||||
%endif
|
||||
inc iiq
|
||||
jle .writeback_loop
|
||||
|
||||
mov eax, 1
|
||||
.return:
|
||||
ADD rsp, pad
|
||||
RET
|
||||
|
||||
%if num_coefs == 16 && dc == 0
|
||||
.return_zero:
|
||||
pxor m0, m0
|
||||
mova [r10+ 0], m0
|
||||
mova [r10+16], m0
|
||||
%if HIGH_BIT_DEPTH
|
||||
mova [r10+32], m0
|
||||
mova [r10+48], m0
|
||||
%endif
|
||||
jmp .return
|
||||
%endif
|
||||
%endmacro ; TRELLIS
|
||||
|
||||
|
||||
|
||||
%macro TRELLIS_LOOP 1 ; ctx_hi
|
||||
.i_loop%1:
|
||||
; if( !quant_coefs[i] )
|
||||
mov r6, quant_coefsm
|
||||
%if HIGH_BIT_DEPTH
|
||||
mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
|
||||
%else
|
||||
movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
|
||||
%endif
|
||||
|
||||
; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
|
||||
; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
|
||||
mov r10, cabac_state_sigm
|
||||
%if num_coefs == 64
|
||||
mov r6d, b_interlacedm
|
||||
add r6d, iid
|
||||
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
|
||||
movzx r10, byte [r10 + r6]
|
||||
%elif num_coefs == 8
|
||||
movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
|
||||
movzx r10, byte [r10 + r13]
|
||||
%else
|
||||
movzx r10, byte [r10 + iiq]
|
||||
%endif
|
||||
|
||||
test abs_leveld, abs_leveld
|
||||
jnz %%.nonzero_quant_coef
|
||||
|
||||
%if %1 == 0
|
||||
; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
|
||||
; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
|
||||
; nodes_cur[0].score -= cost_sig0;
|
||||
movzx r10, word [cabac_entropy + r10*2 GLOBAL]
|
||||
imul r10, lambda2q
|
||||
shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
|
||||
sub [nodes_curq + node_score(0)], r10
|
||||
%endif
|
||||
ZERO_LEVEL_IDX %1, cur
|
||||
jmp .i_continue%1
|
||||
|
||||
%%.nonzero_quant_coef:
|
||||
; int sign_coef = orig_coefs[zigzag[i]];
|
||||
; int abs_coef = abs( sign_coef );
|
||||
; int q = abs( quant_coefs[i] );
|
||||
movzx zigzagid, byte [zigzagq+iiq]
|
||||
movd m0, abs_leveld
|
||||
mov r6, orig_coefsm
|
||||
%if HIGH_BIT_DEPTH
|
||||
LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
|
||||
%else
|
||||
LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
|
||||
psrad m1, 16 ; sign_coef
|
||||
%endif
|
||||
punpcklqdq m0, m0 ; quant_coef
|
||||
%if cpuflag(ssse3)
|
||||
pabsd m0, m0
|
||||
pabsd m2, m1 ; abs_coef
|
||||
%else
|
||||
pxor m8, m8
|
||||
pcmpgtd m8, m1 ; sign_mask
|
||||
pxor m0, m8
|
||||
pxor m2, m1, m8
|
||||
psubd m0, m8
|
||||
psubd m2, m8
|
||||
%endif
|
||||
psubd m0, [sq_1] ; abs_level
|
||||
movd abs_leveld, m0
|
||||
|
||||
xchg nodes_curq, nodes_prevq
|
||||
|
||||
; if( i < num_coefs-1 )
|
||||
; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] :
|
||||
; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
|
||||
; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
|
||||
; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
|
||||
; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
|
||||
; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
|
||||
%if %1 == 0
|
||||
%if dc && num_coefs != 8
|
||||
cmp iid, i_coefsm1
|
||||
%else
|
||||
cmp iid, num_coefs-1
|
||||
%endif
|
||||
je %%.zero_siglast
|
||||
%endif
|
||||
movzx r11, word [cabac_entropy + r10*2 GLOBAL]
|
||||
xor r10, 1
|
||||
movzx r12, word [cabac_entropy + r10*2 GLOBAL]
|
||||
mov [cost_siglast+0], r11d
|
||||
mov r10, cabac_state_lastm
|
||||
%if num_coefs == 64
|
||||
movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
|
||||
movzx r10, byte [r10 + r6]
|
||||
%elif num_coefs == 8
|
||||
movzx r10, byte [r10 + r13]
|
||||
%else
|
||||
movzx r10, byte [r10 + iiq]
|
||||
%endif
|
||||
movzx r11, word [cabac_entropy + r10*2 GLOBAL]
|
||||
add r11, r12
|
||||
mov [cost_siglast+4], r11d
|
||||
%if %1 == 0
|
||||
xor r10, 1
|
||||
movzx r10, word [cabac_entropy + r10*2 GLOBAL]
|
||||
add r10, r12
|
||||
mov [cost_siglast+8], r10d
|
||||
%endif
|
||||
%%.skip_siglast:
|
||||
|
||||
; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
|
||||
; int d = abs_coef - unquant_abs_level;
|
||||
; uint64_t ssd = (int64_t)d*d * coef_weight[i];
|
||||
%if dc
|
||||
pmuludq m0, unquant_mf
|
||||
%else
|
||||
mov r10, unquant_mfm
|
||||
LOAD_DUP m3, [r10 + zigzagiq*4]
|
||||
pmuludq m0, m3
|
||||
%endif
|
||||
paddd m0, [pq_128]
|
||||
psrld m0, 8 ; unquant_abs_level
|
||||
%if psy || dc == 0
|
||||
mova m4, m0
|
||||
%endif
|
||||
psubd m0, m2
|
||||
SQUARE 0, 3
|
||||
%if dc
|
||||
psllq m0, 8
|
||||
%else
|
||||
LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
|
||||
pmuludq m0, m5
|
||||
%endif
|
||||
|
||||
%if psy
|
||||
test iid, iid
|
||||
jz %%.dc_rounding
|
||||
; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
|
||||
; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
|
||||
; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
|
||||
; ssd1[k] -= psy_weight * psy_value;
|
||||
mov r6, fenc_dctm
|
||||
%if HIGH_BIT_DEPTH
|
||||
LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
|
||||
%else
|
||||
LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
|
||||
psrad m3, 16 ; orig_coef
|
||||
%endif
|
||||
%if cpuflag(ssse3)
|
||||
psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
|
||||
%else
|
||||
PSIGN d, m4, m8
|
||||
%endif
|
||||
psubd m3, m1 ; predicted_coef
|
||||
paddd m4, m3
|
||||
%if cpuflag(ssse3)
|
||||
pabsd m4, m4
|
||||
%else
|
||||
ABSD m3, m4
|
||||
SWAP 4, 3
|
||||
%endif
|
||||
LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
|
||||
pmuludq m1, psy_trellis
|
||||
pmuludq m4, m1
|
||||
psubq m0, m4
|
||||
%if %1
|
||||
%%.dc_rounding:
|
||||
%endif
|
||||
%endif
|
||||
%if %1 == 0
|
||||
mova [ssd], m0
|
||||
%endif
|
||||
|
||||
%if dc == 0 && %1 == 0
|
||||
test iid, iid
|
||||
jnz %%.skip_dc_rounding
|
||||
%%.dc_rounding:
|
||||
; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
|
||||
; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
|
||||
; uint64_t ssd = (int64_t)d*d * coef_weight[i];
|
||||
psrad m1, 31 ; sign_coef>>31
|
||||
paddd m4, [pd_8]
|
||||
paddd m4, m1
|
||||
pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
|
||||
psubd m4, m2 ; d
|
||||
SQUARE 4, 3
|
||||
pmuludq m4, m5
|
||||
mova [ssd], m4
|
||||
%%.skip_dc_rounding:
|
||||
%endif
|
||||
mova [ssd+16], m0
|
||||
|
||||
%assign stack_offset_bak stack_offset
|
||||
cmp abs_leveld, 1
|
||||
jl %%.switch_coef0
|
||||
%if %1 == 0
|
||||
mov r10, [ssd] ; trellis_coef* args
|
||||
%endif
|
||||
movq r12, m0
|
||||
; for( int j = 0; j < 8; j++ )
|
||||
; nodes_cur[j].score = TRELLIS_SCORE_MAX;
|
||||
%if cpuflag(ssse3)
|
||||
mova [nodes_curq + node_score(0)], m7
|
||||
mova [nodes_curq + node_score(2)], m7
|
||||
%else ; avoid store-forwarding stalls on k8/k10
|
||||
%if %1 == 0
|
||||
movq [nodes_curq + node_score(0)], m7
|
||||
%endif
|
||||
movq [nodes_curq + node_score(1)], m7
|
||||
movq [nodes_curq + node_score(2)], m7
|
||||
movq [nodes_curq + node_score(3)], m7
|
||||
%endif
|
||||
mova [nodes_curq + node_score(4)], m7
|
||||
mova [nodes_curq + node_score(6)], m7
|
||||
je %%.switch_coef1
|
||||
%%.switch_coefn:
|
||||
call trellis_coefn.entry%1
|
||||
call trellis_coefn.entry%1b
|
||||
jmp .i_continue1
|
||||
%%.switch_coef1:
|
||||
call trellis_coef1.entry%1
|
||||
call trellis_coefn.entry%1b
|
||||
jmp .i_continue1
|
||||
%%.switch_coef0:
|
||||
call trellis_coef0_%1
|
||||
call trellis_coef1.entry%1b
|
||||
|
||||
.i_continue%1:
|
||||
dec iid
|
||||
%if num_coefs == 16 && dc == 0
|
||||
cmp iid, b_acm
|
||||
%endif
|
||||
jge .i_loop%1
|
||||
|
||||
call trellis_bnode_%1
|
||||
%if %1 == 0
|
||||
%if num_coefs == 16 && dc == 0
|
||||
jz .return_zero
|
||||
%else
|
||||
jz .return
|
||||
%endif
|
||||
jmp .writeback
|
||||
|
||||
%%.zero_siglast:
|
||||
xor r6d, r6d
|
||||
mov [cost_siglast+0], r6
|
||||
mov [cost_siglast+8], r6d
|
||||
jmp %%.skip_siglast
|
||||
%endif
|
||||
%endmacro ; TRELLIS_LOOP
|
||||
|
||||
; IF0 / IF1: assembly-time conditional emission (just a synonym for %if).
; IF0 swallows its whole argument and emits nothing; IF1 emits its argument
; verbatim. Written as `IF%1 <instruction>`, the instruction is therefore
; assembled only when the enclosing macro's %1 expands to 1.
|
||||
%macro IF0 1+
|
||||
%endmacro
|
||||
%macro IF1 1+
|
||||
%1
|
||||
%endmacro
|
||||
|
||||
; ZERO_LEVEL_IDX ctx_hi, prev
; Appends one level_tree entry per node and points every node of nodes_cur at
; its new entry (see the C pseudo-code below).
;   %1 (ctx_hi): 0 -> handle nodes 0..3 only, 1 -> handle nodes 0..7
;                (the IF%1 lines and the final increment depend on it).
;   %2:          "cur" or "prev"; selects nodes_curq / nodes_prevq as the
;                source of the old level_idx values that become the links.
; Clobbers m0 and m2 (plus m1 and m3 when %1 == 1); advances levels_used.
%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
|
||||
; for( int j = 0; j < 8; j++ )
|
||||
; nodes_cur[j].level_idx = levels_used;
|
||||
; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
|
||||
; levels_used++;
|
||||
add levels_usedd, 3
|
||||
and levels_usedd, ~3 ; allow aligned stores
|
||||
movd m0, levels_usedd
|
||||
pshufd m0, m0, 0 ; broadcast levels_used to all four dwords
|
||||
IF%1 mova m1, m0
|
||||
paddd m0, [pd_0123] ; new indices for nodes 0..3 (pd_0123 presumably holds 0,1,2,3 - defined outside this view)
|
||||
IF%1 paddd m1, [pd_4567] ; new indices for nodes 4..7
|
||||
mova m2, [nodes_%2q + node_level_idx(0)] ; old indices, read before nodes_cur is overwritten
|
||||
IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
|
||||
mova [nodes_curq + node_level_idx(0)], m0
|
||||
IF%1 mova [nodes_curq + node_level_idx(4)], m1
|
||||
mova [level_tree + (levels_usedq+0)*4], m2 ; new entries hold the old indices as their links
|
||||
IF%1 mova [level_tree + (levels_usedq+4)*4], m3
|
||||
add levels_usedd, (1+%1)*4 ; 4 entries consumed, or 8 when ctx_hi
|
||||
%endmacro
|
||||
|
||||
; Instantiate every trellis function twice: once for SSE2, once for SSSE3.
; NOTE(review): the TRELLIS macro is defined outside this view; judging by the
; names its arguments appear to be (function name, num_coefs, dc, psy) -
; confirm against the macro definition.
INIT_XMM sse2
|
||||
TRELLIS trellis_cabac_4x4, 16, 0, 0
|
||||
TRELLIS trellis_cabac_8x8, 64, 0, 0
|
||||
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
|
||||
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
|
||||
TRELLIS trellis_cabac_dc, 16, 1, 0
|
||||
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
|
||||
INIT_XMM ssse3
|
||||
TRELLIS trellis_cabac_4x4, 16, 0, 0
|
||||
TRELLIS trellis_cabac_8x8, 64, 0, 0
|
||||
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
|
||||
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
|
||||
TRELLIS trellis_cabac_dc, 16, 1, 0
|
||||
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
|
||||
|
||||
|
||||
|
||||
; Aliases for the shared helper routines below (trellis_bnode_*, trellis_coef0_*,
; trellis_coef1, trellis_coefn).
; NOTE(review): `stack` is offset by gprsize presumably because the helpers are
; entered via `call`, which pushes a return address - confirm against the
; definition of `stack` used by the TRELLIS functions themselves.
%define stack rsp+gprsize
|
||||
%define scoreq r14 ; candidate node score being built up
|
||||
%define bitsq r13 ; bit-cost accumulator (64-bit view)
|
||||
%define bitsd r13d ; bit-cost accumulator (32-bit view)
|
||||
|
||||
; the helpers are assembled once, with no cpu-specific suffix
INIT_XMM
|
||||
; clocal name
; Opens a file-local helper routine: aligns to 16 bytes, declares the mangled
; symbol (private_prefix_name) global, and defines both the mangled label and
; the plain label %1 at the same address, so code in this file can `call` it
; by its short name.
; stack_offset is reset to stack_offset_bak+gprsize: stack_offset_bak is the
; value saved by TRELLIS_LOOP just before it calls the helpers, and the extra
; gprsize presumably accounts for the return address pushed by `call`.
%macro clocal 1
|
||||
ALIGN 16
|
||||
global mangle(private_prefix %+ _%1)
|
||||
mangle(private_prefix %+ _%1):
|
||||
%1:
|
||||
%assign stack_offset stack_offset_bak+gprsize
|
||||
%endmacro
|
||||
|
||||
; TRELLIS_BNODE ctx_hi
; Emits trellis_bnode_<ctx_hi>: finds the node of nodes_cur with the lowest
; score (nodes 0..3 when ctx_hi == 0, nodes 1..7 when ctx_hi == 1).
; Each score is packed as score*8 + j, so a single unsigned compare + cmova
; both picks the minimum and carries the node index in the low 3 bits
; (the top 3 bits of the score are shifted out by the *8).
; Out:   eax = index of the best node; ZF reflects eax == 0, which the caller
;        tests with `jz` directly after the call.
;        r10 = dctm (loaded for the caller; mov does not disturb the flags).
; Clobbers: rax, r10, r11, flags.
%macro TRELLIS_BNODE 1 ; ctx_hi
|
||||
clocal trellis_bnode_%1
|
||||
; int j = ctx_hi?1:0;
|
||||
; trellis_node_t *bnode = &nodes_cur[j];
|
||||
; while( ++j < (ctx_hi?8:4) )
|
||||
; if( nodes_cur[j].score < bnode->score )
|
||||
; bnode = &nodes_cur[j];
|
||||
%assign j %1
|
||||
mov rax, [nodes_curq + node_score(j)]
|
||||
lea rax, [rax*8 + j] ; pack: score in the high bits, node index in bits 0..2
|
||||
%rep 3+3*%1
|
||||
%assign j j+1
|
||||
mov r11, [nodes_curq + node_score(j)]
|
||||
lea r11, [r11*8 + j]
|
||||
cmp rax, r11
|
||||
cmova rax, r11 ; keep the smaller packed value (unsigned); on equal scores the lower index stays
|
||||
%endrep
|
||||
mov r10, dctm
|
||||
and eax, 7 ; unpack the winning index; also sets ZF for the caller
|
||||
ret
|
||||
%endmacro ; TRELLIS_BNODE
|
||||
TRELLIS_BNODE 0
|
||||
TRELLIS_BNODE 1
|
||||
|
||||
|
||||
; TRELLIS_COEF0 ctx_hi
; Emits trellis_coef0_<ctx_hi>: handles the candidate "this coefficient is
; quantized to zero". Node scores and cabac states are carried over from
; nodes_prev to nodes_cur; only for ctx_hi == 0 does node 0 get a recomputed
; score (prev score + ssd - ssd1, see below).
; In:    [cost_siglast+0], [ssd], [ssd+16], lambda2q, nodes_prevq, nodes_curq
; Out:   r11 = ssd1 including the scaled significance cost; the .entry*b
;        paths of trellis_coef1 subtract r11 right after this call returns.
; Clobbers: r6, r11, scoreq, m0-m3 (m1/m3 via ZERO_LEVEL_IDX when ctx_hi), flags.
%macro TRELLIS_COEF0 1 ; ctx_hi
|
||||
clocal trellis_coef0_%1
|
||||
; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
|
||||
mov r11d, [cost_siglast+0]
|
||||
imul r11, lambda2q
|
||||
shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
|
||||
add r11, [ssd+16]
|
||||
%if %1 == 0
|
||||
; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
|
||||
mov scoreq, [nodes_prevq + node_score(0)]
|
||||
add scoreq, [ssd]
|
||||
sub scoreq, r11
|
||||
mov [nodes_curq + node_score(0)], scoreq
|
||||
%endif
|
||||
; memcpy
|
||||
mov scoreq, [nodes_prevq + node_score(1)]
|
||||
mov [nodes_curq + node_score(1)], scoreq
|
||||
mova m1, [nodes_prevq + node_score(2)] ; scores 2 and 3 in one 16-byte move
|
||||
mova [nodes_curq + node_score(2)], m1
|
||||
%if %1
|
||||
mova m1, [nodes_prevq + node_score(4)]
|
||||
mova [nodes_curq + node_score(4)], m1
|
||||
mova m1, [nodes_prevq + node_score(6)]
|
||||
mova [nodes_curq + node_score(6)], m1
|
||||
%endif
|
||||
mov r6d, [nodes_prevq + node_cabac_state(3)]
|
||||
mov [nodes_curq + node_cabac_state(3)], r6d
|
||||
%if %1
|
||||
mova m1, [nodes_prevq + node_cabac_state(4)] ; 16 bytes: the states of nodes 4..7 (4 bytes per node)
|
||||
mova [nodes_curq + node_cabac_state(4)], m1
|
||||
%endif
|
||||
ZERO_LEVEL_IDX %1, prev
|
||||
ret
|
||||
%endmacro ; TRELLIS_COEF0
|
||||
TRELLIS_COEF0 0
|
||||
TRELLIS_COEF0 1
|
||||
|
||||
|
||||
|
||||
; START_COEF gt1
; First half of one trellis transition out of parent node j (used by COEF1 and
; COEFN, which set the assembly-time variables first).
;   %1 (gt1): 0 when the candidate has abs_level == 1, 1 when abs_level > 1.
; Assembly-time inputs: j, nextj_if_invalid, coeff_abs_level1_offs.
; Run-time inputs: nodes_prevq, level_statem, r10 / r12 (ssd of the candidate,
;                  for j == 0 / j != 0), [cost_siglast+4], [cost_siglast+8].
; Out:   scoreq = parent score + ssd
;        bitsd  = cost of coding the "abs_level > 1" decision + sig/last cost
;        r11    = state after the cabac transition (only when j >= 3; it is
;                 stored later by END_COEF)
; For j > 0 the transition is skipped by jumping to .ctx<nextj_if_invalid>
; when the parent score has its sign bit set.
; Clobbers: r6, flags.
%macro START_COEF 1 ; gt1
|
||||
; if( (int64_t)nodes_prev[0].score < 0 ) continue;
|
||||
mov scoreq, [nodes_prevq + node_score(j)]
|
||||
%if j > 0
|
||||
test scoreq, scoreq
|
||||
js .ctx %+ nextj_if_invalid
|
||||
%endif
|
||||
|
||||
; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
|
||||
%if j >= 3
|
||||
movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
|
||||
movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
|
||||
%else
|
||||
movzx r6d, byte [level_statem + coeff_abs_level1_offs]
|
||||
%endif
|
||||
%if %1
|
||||
xor r6d, 1 ; flip the low state bit to index the cost of coding a 1 instead of a 0
|
||||
%endif
|
||||
movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
|
||||
|
||||
; n.score += ssd;
|
||||
; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
|
||||
%if j == 0
|
||||
add scoreq, r10
|
||||
add bitsd, [cost_siglast+8]
|
||||
%else
|
||||
add scoreq, r12
|
||||
add bitsd, [cost_siglast+4]
|
||||
%endif
|
||||
%endmacro ; START_COEF
|
||||
|
||||
; END_COEF gt1
; Second half of one trellis transition: converts the accumulated bit cost to
; a score, and if the candidate beats the current occupant of
; nodes_cur[node_ctx] it replaces it (score, cabac state, level_tree link).
;   %1 (gt1): 0 -> the recorded abs_level is the constant 1,
;             1 -> the recorded abs_level is taken from abs_leveld.
; Assembly-time inputs: j, node_ctx, nextj_if_valid, coeff_abs_level1_offs,
;                       coeff_abs_levelgt1_offs (only used when %1 && node_ctx == 7).
; Run-time inputs: scoreq / bitsq from START_COEF, r11 (delayed cabac state,
;                  j >= 3), r10 (updated levelgt1 state, node_ctx == 7),
;                  level_state_packed, levels_used.
; If the candidate is not better (unsigned compare), control jumps to
; .ctx<nextj_if_valid> without touching nodes_cur.
; Clobbers: r6, r11 (when %1), bitsq, scoreq, flags; increments levels_used on success.
%macro END_COEF 1
|
||||
; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
|
||||
imul bitsq, lambda2q
|
||||
shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
|
||||
add scoreq, bitsq
|
||||
|
||||
; if( n.score < nodes_cur[node_ctx].score )
|
||||
; SET_LEVEL( n, abs_level );
|
||||
; nodes_cur[node_ctx] = n;
|
||||
cmp scoreq, [nodes_curq + node_score(node_ctx)]
|
||||
jae .ctx %+ nextj_if_valid
|
||||
mov [nodes_curq + node_score(node_ctx)], scoreq
|
||||
%if j == 2 || (j <= 3 && node_ctx == 4)
|
||||
; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
|
||||
movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
|
||||
%elif j >= 3
|
||||
; if we have updated before, then copy cabac_state from the parent node
|
||||
mov r6d, [nodes_prevq + node_cabac_state(j)]
|
||||
mov [nodes_curq + node_cabac_state(node_ctx)], r6d
|
||||
%endif
|
||||
%if j >= 3 ; skip the transition if we're not going to reuse the context
|
||||
mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
|
||||
%endif
|
||||
%if %1 && node_ctx == 7
|
||||
mov r6d, levelgt1_ctxm ; r6 is what coeff_abs_levelgt1_offs expands to after COEFN's j == 7 %define
|
||||
mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
|
||||
%endif
|
||||
mov r6d, [nodes_prevq + node_level_idx(j)]
|
||||
%if %1
|
||||
mov r11d, abs_leveld
|
||||
shl r11d, 16
|
||||
or r6d, r11d ; level_tree entry = parent link | abs_level<<16
|
||||
%else
|
||||
or r6d, 1<<16 ; abs_level is 1 on this path
|
||||
%endif
|
||||
mov [level_tree + levels_usedq*4], r6d
|
||||
mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
|
||||
inc levels_usedd
|
||||
%endmacro ; END_COEF
|
||||
|
||||
|
||||
|
||||
; COEF1 j, nextj_if_invalid
; Emits the code at label .ctx<j> that tries the transition from parent node j
; with a candidate of abs_level == 1.
;   %1: parent node index j; on a non-improving candidate control falls on to
;       .ctx<j+1> (nextj_if_valid).
;   %2: label index to jump to when the parent node is invalid.
; Destination node: j+1 for j < 3, otherwise j.
%macro COEF1 2
|
||||
%assign j %1
|
||||
%assign nextj_if_valid %1+1
|
||||
%assign nextj_if_invalid %2
|
||||
%if j < 4
|
||||
%assign coeff_abs_level1_offs j+1
|
||||
%else
|
||||
%assign coeff_abs_level1_offs 0
|
||||
%endif
|
||||
%if j < 3
|
||||
%assign node_ctx j+1
|
||||
%else
|
||||
%assign node_ctx j
|
||||
%endif
|
||||
.ctx %+ j:
|
||||
START_COEF 0
|
||||
add bitsd, 1 << CABAC_SIZE_BITS ; one extra whole bit of cost (presumably the bypass-coded sign - TODO confirm)
|
||||
END_COEF 0
|
||||
%endmacro ; COEF1
|
||||
|
||||
; COEFN j, next
; Emits the code at label .ctx<j> that tries the transition from parent node j
; with a candidate of abs_level > 1.
;   %1: parent node index j.
;   %2: label index to continue at, whether or not the candidate was taken
;       (nextj_if_valid and nextj_if_invalid are both %2).
; Destination node: 4 for j < 4, j+1 for j in 4..6, 7 for j == 7.
; Run-time inputs set up by COEFN_PREFIX / COEFN_SUFFIX:
;   r1d = prefix index scaled by 128 (the -128 in the table addressing below
;         turns it into (abs_level-1)*128), r5d = escape-suffix bit cost or 0.
; Clobbers r10 and r6 in addition to what START_COEF / END_COEF clobber.
%macro COEFN 2
|
||||
%assign j %1
|
||||
%assign nextj_if_valid %2
|
||||
%assign nextj_if_invalid %2
|
||||
%if j < 4
|
||||
%assign coeff_abs_level1_offs j+1
|
||||
%assign coeff_abs_levelgt1_offs 5
|
||||
%else
|
||||
%assign coeff_abs_level1_offs 0
|
||||
%assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
|
||||
%endif
|
||||
%if j < 4
|
||||
%assign node_ctx 4
|
||||
%elif j < 7
|
||||
%assign node_ctx j+1
|
||||
%else
|
||||
%assign node_ctx 7
|
||||
%endif
|
||||
.ctx %+ j:
|
||||
START_COEF 1
|
||||
; if( abs_level >= 15 )
|
||||
; bits += bs_size_ue_big(...)
|
||||
add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
|
||||
; n.cabac_state[levelgt1_ctx]
|
||||
%if j == 7 ; && compiling support for 4:2:2
|
||||
mov r6d, levelgt1_ctxm
|
||||
; from here on the offset is a run-time register value instead of a constant
%define coeff_abs_levelgt1_offs r6
|
||||
%endif
|
||||
%if j == 7
|
||||
movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
|
||||
%else
|
||||
movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
|
||||
%endif
|
||||
; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
|
||||
add r10d, r1d
|
||||
movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
|
||||
add bitsd, r6d
|
||||
%if node_ctx == 7
|
||||
movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL] ; updated state; END_COEF stores it into the node
|
||||
%endif
|
||||
END_COEF 1
|
||||
%endmacro ; COEFN
|
||||
|
||||
|
||||
|
||||
; trellis_coef1: evaluates a candidate with abs_level == 1 against every
; reachable parent node. Four entry points:
;   .entry0  / .entry1  - ctx_lo / ctx_hi, smaller of the two candidates;
;                         r10 (ctx_lo only) and r12 already hold the ssd values.
;   .entry0b / .entry1b - ctx_lo / ctx_hi, larger candidate; the ssd values are
;                         reloaded from the upper qwords of [ssd] / [ssd+16]
;                         and r11 is subtracted (r11 is expected to hold the
;                         value left by trellis_coef0_*, which TRELLIS_LOOP
;                         calls immediately before .entry*b).
; In the ctx_lo chain an invalid parent jumps straight to .ctx4 (return on
; first failure); in the ctx_hi chain an invalid parent only skips to the next j.
clocal trellis_coef1
|
||||
.entry0b: ; ctx_lo, larger of the two abs_level candidates
|
||||
mov r10, [ssd+8]
|
||||
sub r10, r11
|
||||
mov r12, [ssd+24]
|
||||
sub r12, r11
|
||||
.entry0: ; ctx_lo, smaller of the two abs_level candidates
|
||||
COEF1 0, 4
|
||||
COEF1 1, 4
|
||||
COEF1 2, 4
|
||||
COEF1 3, 4
|
||||
.ctx4:
|
||||
rep ret
|
||||
.entry1b: ; ctx_hi, larger of the two abs_level candidates
|
||||
mov r12, [ssd+24]
|
||||
sub r12, r11
|
||||
.entry1: ; ctx_hi, smaller of the two abs_level candidates
|
||||
; a new non-local label: the .ctx<N> labels below are scoped to it, so they do
; not collide with the ctx_lo .ctx<N> labels above
trellis_coef1_hi:
|
||||
COEF1 1, 2
|
||||
COEF1 2, 3
|
||||
COEF1 3, 4
|
||||
COEF1 4, 5
|
||||
COEF1 5, 6
|
||||
COEF1 6, 7
|
||||
COEF1 7, 8
|
||||
.ctx8:
|
||||
rep ret
|
||||
|
||||
; COEFN_PREFIX n
; Prepares the registers COEFN reads for the unary-prefix cost lookup:
;   r1d = abs_level << 7 (row offset into the unary tables; COEFN subtracts
;         128, giving (abs_level-1)*128), with abs_level replaced by 15 by the
;         matching COEFN_SUFFIX when abs_level >= 15;
;   r5d = 0 when there is no escape suffix, otherwise the suffix cost computed
;         by COEFN_SUFFIX n, which jumps back to .skip_level_suffix<n>.
; r1 is overwritten here; trellis_coefn reloads zigzagq from zigzagm before
; returning for that reason.
%macro COEFN_PREFIX 1
|
||||
; int prefix = X264_MIN( abs_level - 1, 14 );
|
||||
mov r1d, abs_leveld
|
||||
cmp abs_leveld, 15
|
||||
jge .level_suffix%1
|
||||
xor r5d, r5d
|
||||
.skip_level_suffix%1:
|
||||
shl r1d, 7
|
||||
%endmacro
|
||||
|
||||
; COEFN_SUFFIX n
; Out-of-line continuation of COEFN_PREFIX n for abs_level >= 15.
; Computes r5d = (2*floor(log2(abs_level-14)) + 1) << CABAC_SIZE_BITS, i.e. the
; escape-suffix length in bits scaled to the fixed-point cost unit, clamps the
; prefix (r1d = 15, so the later -128 adjustment selects row 14) and jumps back
; into the prefix code.
%macro COEFN_SUFFIX 1
|
||||
.level_suffix%1:
|
||||
; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
|
||||
lea r5d, [abs_levelq-14] ; (abs_level-15)+1, always >= 1 here so bsr is well defined
|
||||
bsr r5d, r5d
|
||||
shl r5d, CABAC_SIZE_BITS+1
|
||||
add r5d, 1<<CABAC_SIZE_BITS
|
||||
; int prefix = X264_MIN( abs_level - 1, 14 );
|
||||
mov r1d, 15
|
||||
jmp .skip_level_suffix%1
|
||||
%endmacro
|
||||
|
||||
; trellis_coefn: evaluates a candidate with abs_level > 1 against every
; reachable parent node. Four entry points:
;   .entry0  / .entry1  - ctx_lo / ctx_hi, smaller of the two candidates.
;   .entry0b / .entry1b - ctx_lo / ctx_hi, larger candidate: reloads the ssd
;                         values from the upper qwords of [ssd] / [ssd+16] and
;                         increments abs_level first.
; The ctx_hi path handles parents 4..7 and then jumps into the ctx_lo chain at
; .ctx1 to handle parents 1..3, so both paths return through .ctx8.
clocal trellis_coefn
|
||||
.entry0b:
|
||||
mov r10, [ssd+8]
|
||||
mov r12, [ssd+24]
|
||||
inc abs_leveld
|
||||
.entry0:
|
||||
; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
|
||||
; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
|
||||
; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
|
||||
; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
|
||||
; The C version has to be fully separate since C doesn't support multiple
|
||||
; entrypoints. But return-on-first-failure isn't very important here (as
|
||||
; opposed to coef1), so I might as well reduce codesize.
|
||||
COEFN_PREFIX 0
|
||||
COEFN 0, 1
|
||||
COEFN 1, 2
|
||||
COEFN 2, 3
|
||||
COEFN 3, 8
|
||||
.ctx8:
|
||||
mov zigzagq, zigzagm ; unspill since r1 was clobbered
|
||||
ret
|
||||
.entry1b:
|
||||
mov r12, [ssd+24]
|
||||
inc abs_leveld
|
||||
.entry1:
|
||||
COEFN_PREFIX 1
|
||||
COEFN 4, 5
|
||||
COEFN 5, 6
|
||||
COEFN 6, 7
|
||||
COEFN 7, 1
|
||||
jmp .ctx1 ; continue with parents 1..3 in the shared ctx_lo chain
|
||||
; out-of-line escape-suffix code for the two COEFN_PREFIX expansions above
COEFN_SUFFIX 0
|
||||
COEFN_SUFFIX 1
|
||||
259
common/x86/util.h
Normal file
259
common/x86/util.h
Normal file
@@ -0,0 +1,259 @@
|
||||
/*****************************************************************************
|
||||
* util.h: x86 inline asm
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2008-2025 x264 project
|
||||
*
|
||||
* Authors: Fiona Glaser <fiona@x264.com>
|
||||
* Loren Merritt <lorenm@u.washington.edu>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_X86_UTIL_H
|
||||
#define X264_X86_UTIL_H
|
||||
|
||||
/* SSE-specific 128-bit helper types, only available when the compiler targets SSE. */
#ifdef __SSE__
|
||||
#include <xmmintrin.h>
|
||||
|
||||
/* Replace any earlier M128_ZERO definition with an all-zero __m128 literal. */
#undef M128_ZERO
|
||||
#define M128_ZERO ((__m128){0,0,0,0})
|
||||
#define x264_union128_t x264_union128_sse_t
|
||||
/* One 128-bit value viewed as an __m128 or as 2x64 / 4x32 / 8x16 / 16x8-bit unsigned lanes. */
typedef union { __m128 i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_sse_t;
|
||||
#if HAVE_VECTOREXT
|
||||
/* GCC vector-extension type: four uint32_t lanes in 16 bytes. */
typedef uint32_t v4si __attribute__((vector_size (16)));
|
||||
#endif
|
||||
#endif // __SSE__
|
||||
|
||||
#if HAVE_X86_INLINE_ASM && HAVE_MMX
|
||||
|
||||
/* Lane-wise median of three motion vectors.
 * Loads 32 bits (two int16 lanes) from each of a, b and c, computes
 * max( min(a,b), min( max(a,b), c ) ) independently per signed 16-bit lane and
 * stores the 32-bit result to dst.
 * NOTE(review): the function leaves the MMX state live (no emms here);
 * presumably handled elsewhere - confirm against the callers. */
#define x264_median_mv x264_median_mv_mmx2
|
||||
static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
|
||||
{
|
||||
asm(
|
||||
"movd %1, %%mm0 \n" // mm0 = a
|
||||
"movd %2, %%mm1 \n" // mm1 = b
|
||||
"movq %%mm0, %%mm3 \n" // mm3 = a (copy)
|
||||
"movd %3, %%mm2 \n" // mm2 = c
|
||||
"pmaxsw %%mm1, %%mm0 \n" // mm0 = max(a,b)
|
||||
"pminsw %%mm3, %%mm1 \n" // mm1 = min(a,b)
|
||||
"pminsw %%mm2, %%mm0 \n" // mm0 = min(max(a,b), c)
|
||||
"pmaxsw %%mm1, %%mm0 \n" // mm0 = median
|
||||
"movd %%mm0, %0 \n"
|
||||
:"=m"(*(x264_union32_t*)dst)
|
||||
:"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
|
||||
:"mm0", "mm1", "mm2", "mm3"
|
||||
);
|
||||
}
|
||||
|
||||
/* Sum of absolute component differences between consecutive candidate vectors:
 * adds |mvc[k][0]-mvc[k+1][0]| and |mvc[k][1]-mvc[k+1][1]| over adjacent pairs.
 * The list is walked from its end towards the start, two differences per loop
 * iteration; when i_mvc is even the first step handles a single pair (the path
 * that falls through the `jnz 3f`) so the remaining count is a multiple of two.
 * The per-lane accumulation is 16-bit and saturating (paddusw).
 * Operands: %0 = sum, %1 = i_mvc (modified), %2 = mvc, %4 = pw_1.
 * NOTE(review): the first step reads mvc[i_mvc-2] and mvc[i_mvc-1], so the
 * code appears to require i_mvc >= 2 - confirm against the callers. */
#define x264_predictor_difference x264_predictor_difference_mmx2
|
||||
static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
|
||||
{
|
||||
int sum;
|
||||
static const uint64_t pw_1 = 0x0001000100010001ULL;
|
||||
|
||||
asm(
|
||||
"pxor %%mm4, %%mm4 \n" // mm4 = running sum (4 x 16-bit)
|
||||
"test $1, %1 \n"
|
||||
"jnz 3f \n" // odd count: go straight to the two-at-a-time loop
|
||||
"movd -8(%2,%1,4), %%mm0 \n"
|
||||
"movd -4(%2,%1,4), %%mm3 \n"
|
||||
"psubw %%mm3, %%mm0 \n" // single difference of the last two vectors
|
||||
"jmp 2f \n"
|
||||
"3: \n"
|
||||
"dec %1 \n"
|
||||
"1: \n"
|
||||
"movq -8(%2,%1,4), %%mm0 \n"
|
||||
"psubw -4(%2,%1,4), %%mm0 \n" // two differences at once (overlapping 8-byte loads)
|
||||
"2: \n"
|
||||
"sub $2, %1 \n" // sets the flags consumed by "jg 1b" below
|
||||
"pxor %%mm2, %%mm2 \n"
|
||||
"psubw %%mm0, %%mm2 \n"
|
||||
"pmaxsw %%mm2, %%mm0 \n" // |x| = max(x, -x)
|
||||
"paddusw %%mm0, %%mm4 \n"
|
||||
"jg 1b \n" // MMX instructions leave EFLAGS untouched
|
||||
"pmaddwd %4, %%mm4 \n" // horizontal add: words -> dwords
|
||||
"pshufw $14, %%mm4, %%mm0 \n" // bring the high dword down
|
||||
"paddd %%mm0, %%mm4 \n"
|
||||
"movd %%mm4, %0 \n"
|
||||
:"=r"(sum), "+r"(i_mvc)
|
||||
:"r"(mvc), "m"(MEM_DYN( mvc, const int16_t )), "m"(pw_1)
|
||||
:"mm0", "mm2", "mm3", "mm4", "cc"
|
||||
);
|
||||
return sum;
|
||||
}
|
||||
|
||||
/* Per-byte classification of the summed left/top mvd magnitudes:
 * for each byte lane s = min( sat_add(left, top), 33 ) the result lane is
 * (s > 2) + (s > 32), i.e. 0, 1 or 2. The low two lanes are returned packed
 * in a uint16_t.
 * Operands: %1 = mvdleft, %2 = mvdtop, %3 = pb_2, %4 = pb_32, %5 = pb_33.
 * NOTE(review): the inputs are declared as 16-bit memory operands (M16) but
 * are loaded with movd, which reads 32 bits; only the low 16 bits influence
 * the returned value - confirm the extra two bytes are always readable. */
#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
|
||||
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
|
||||
{
|
||||
static const uint64_t pb_2 = 0x0202020202020202ULL;
|
||||
static const uint64_t pb_32 = 0x2020202020202020ULL;
|
||||
static const uint64_t pb_33 = 0x2121212121212121ULL;
|
||||
int amvd;
|
||||
asm(
|
||||
"movd %1, %%mm0 \n"
|
||||
"movd %2, %%mm1 \n"
|
||||
"paddusb %%mm1, %%mm0 \n" // saturating left + top
|
||||
"pminub %5, %%mm0 \n" // clamp to 33 so the signed compares below are safe
|
||||
"pxor %%mm2, %%mm2 \n"
|
||||
"movq %%mm0, %%mm1 \n"
|
||||
"pcmpgtb %3, %%mm0 \n" // -1 where sum > 2
|
||||
"pcmpgtb %4, %%mm1 \n" // -1 where sum > 32
|
||||
"psubb %%mm0, %%mm2 \n"
|
||||
"psubb %%mm1, %%mm2 \n" // 0 - (-1) - (-1): count of thresholds exceeded
|
||||
"movd %%mm2, %0 \n"
|
||||
:"=r"(amvd)
|
||||
:"m"(M16( mvdleft )),"m"(M16( mvdtop )),
|
||||
"m"(pb_2),"m"(pb_32),"m"(pb_33)
|
||||
:"mm0", "mm1", "mm2"
|
||||
);
|
||||
return (uint16_t)amvd;
|
||||
}
|
||||
|
||||
/* Copies candidate motion vectors from mvc to dst, dropping every vector that
 * equals pmv or is zero and clamping the rest to mv_limit (scaled by 4, see
 * "Convert to subpel"). Returns the number of vectors written.
 * The main loop handles two vectors per iteration; a trailing single vector
 * is handled by the block at label 2.
 * Operand map: %0 = mvc (advanced), %2 = tmp (mv_limit pointer on entry, then
 * scratch for the compare mask), %3 = mvc_max (count, then end pointer),
 * %4 = i (output count), %5 = dst, %6 = pmv, %7 = pd_32.
 * NOTE(review): the two-vector path stores 8 bytes at dst[i] even when only
 * one (or none) of the pair is kept, so dst presumably needs slack beyond the
 * returned count - confirm against the callers. */
#define x264_predictor_clip x264_predictor_clip_mmx2
|
||||
static ALWAYS_INLINE int x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
|
||||
{
|
||||
static const uint32_t pd_32 = 0x20;
|
||||
intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
|
||||
|
||||
asm(
|
||||
"movq (%2), %%mm5 \n"
|
||||
"movd %6, %%mm3 \n"
|
||||
"psllw $2, %%mm5 \n" // Convert to subpel
|
||||
"pshufw $0xEE, %%mm5, %%mm6 \n"
|
||||
"dec %k3 \n"
|
||||
"jz 2f \n" // if( i_mvc == 1 ) {do the last iteration}
|
||||
"punpckldq %%mm3, %%mm3 \n"
|
||||
"punpckldq %%mm5, %%mm5 \n"
|
||||
"movd %7, %%mm4 \n"
|
||||
"lea (%0,%3,4), %3 \n"
|
||||
"1: \n"
|
||||
"movq (%0), %%mm0 \n"
|
||||
"add $8, %0 \n"
|
||||
"movq %%mm3, %%mm1 \n"
|
||||
"pxor %%mm2, %%mm2 \n"
|
||||
"pcmpeqd %%mm0, %%mm1 \n" // mv == pmv
|
||||
"pcmpeqd %%mm0, %%mm2 \n" // mv == 0
|
||||
"por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
|
||||
"pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf
|
||||
"pmaxsw %%mm5, %%mm0 \n"
|
||||
"pminsw %%mm6, %%mm0 \n"
|
||||
"pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
|
||||
"psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped
|
||||
"movq %%mm0, (%5,%4,4) \n"
|
||||
"and $24, %k2 \n"
|
||||
"add $2, %4 \n"
|
||||
"add $8, %k2 \n"
|
||||
"shr $4, %k2 \n" // (4-val)>>1
|
||||
"sub %2, %4 \n" // +1 for each valid motion vector
|
||||
"cmp %3, %0 \n"
|
||||
"jl 1b \n"
|
||||
"jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration}
|
||||
|
||||
/* Do the last iteration */
|
||||
"2: \n"
|
||||
"movd (%0), %%mm0 \n"
|
||||
"pxor %%mm2, %%mm2 \n"
|
||||
"pcmpeqd %%mm0, %%mm3 \n"
|
||||
"pcmpeqd %%mm0, %%mm2 \n"
|
||||
"por %%mm3, %%mm2 \n"
|
||||
"pmovmskb %%mm2, %k2 \n"
|
||||
"pmaxsw %%mm5, %%mm0 \n"
|
||||
"pminsw %%mm6, %%mm0 \n"
|
||||
"movd %%mm0, (%5,%4,4) \n"
|
||||
"inc %4 \n"
|
||||
"and $1, %k2 \n"
|
||||
"sub %2, %4 \n" // output += !(mv == pmv || mv == 0)
|
||||
"3: \n"
|
||||
:"+r"(mvc), "=m"(MEM_DYN( dst, int16_t )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
|
||||
:"r"(dst), "g"(pmv), "m"(pd_32), "m"(MEM_DYN( mvc, const int16_t ))
|
||||
:"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "cc"
|
||||
);
|
||||
return i;
|
||||
}
|
||||
|
||||
/* Same as the above, except we do (mv + 2) >> 2 on the input.
 * Each input vector is rounded to a quarter of its value (paddw pw_2, psraw 2)
 * before the pmv / zero comparison and the clamp; unlike
 * x264_predictor_clip_mmx2 the limits are used as loaded, without the <<2.
 * Returns the number of vectors written to dst.
 * Operand map: %0 = mvc (advanced), %2 = tmp (mv_limit pointer on entry, then
 * scratch), %3 = mvc_max (count, then end pointer), %4 = i (output count),
 * %5 = dst, %6 = pw_2, %7 = pmv, %8 = pd_32. */
|
||||
#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
|
||||
static ALWAYS_INLINE int x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
|
||||
{
|
||||
static const uint64_t pw_2 = 0x0002000200020002ULL;
|
||||
static const uint32_t pd_32 = 0x20;
|
||||
intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
|
||||
|
||||
asm(
|
||||
"movq (%2), %%mm5 \n"
|
||||
"movq %6, %%mm7 \n" // mm7 = rounding constant 2 in every word
|
||||
"movd %7, %%mm3 \n"
|
||||
"pshufw $0xEE, %%mm5, %%mm6 \n"
|
||||
"dec %k3 \n"
|
||||
"jz 2f \n" // single input vector: only the tail block runs
|
||||
"punpckldq %%mm3, %%mm3 \n"
|
||||
"punpckldq %%mm5, %%mm5 \n"
|
||||
"movd %8, %%mm4 \n"
|
||||
"lea (%0,%3,4), %3 \n"
|
||||
"1: \n"
|
||||
"movq (%0), %%mm0 \n"
|
||||
"add $8, %0 \n"
|
||||
"paddw %%mm7, %%mm0 \n"
|
||||
"psraw $2, %%mm0 \n" // (mv + 2) >> 2
|
||||
"movq %%mm3, %%mm1 \n"
|
||||
"pxor %%mm2, %%mm2 \n"
|
||||
"pcmpeqd %%mm0, %%mm1 \n" // mv == pmv
|
||||
"pcmpeqd %%mm0, %%mm2 \n" // mv == 0
|
||||
"por %%mm1, %%mm2 \n"
|
||||
"pmovmskb %%mm2, %k2 \n"
|
||||
"pmaxsw %%mm5, %%mm0 \n"
|
||||
"pminsw %%mm6, %%mm0 \n"
|
||||
"pand %%mm4, %%mm2 \n"
|
||||
"psrlq %%mm2, %%mm0 \n" // shift the second vector down if the first is dropped
|
||||
"movq %%mm0, (%5,%4,4) \n"
|
||||
"and $24, %k2 \n"
|
||||
"add $2, %4 \n"
|
||||
"add $8, %k2 \n"
|
||||
"shr $4, %k2 \n"
|
||||
"sub %2, %4 \n" // net effect: i += number of vectors kept
|
||||
"cmp %3, %0 \n"
|
||||
"jl 1b \n"
|
||||
"jg 3f \n"
|
||||
|
||||
/* Do the last iteration */
|
||||
"2: \n"
|
||||
"movd (%0), %%mm0 \n"
|
||||
"paddw %%mm7, %%mm0 \n"
|
||||
"psraw $2, %%mm0 \n"
|
||||
"pxor %%mm2, %%mm2 \n"
|
||||
"pcmpeqd %%mm0, %%mm3 \n"
|
||||
"pcmpeqd %%mm0, %%mm2 \n"
|
||||
"por %%mm3, %%mm2 \n"
|
||||
"pmovmskb %%mm2, %k2 \n"
|
||||
"pmaxsw %%mm5, %%mm0 \n"
|
||||
"pminsw %%mm6, %%mm0 \n"
|
||||
"movd %%mm0, (%5,%4,4) \n"
|
||||
"inc %4 \n"
|
||||
"and $1, %k2 \n"
|
||||
"sub %2, %4 \n"
|
||||
"3: \n"
|
||||
:"+r"(mvc), "=m"(MEM_DYN( dst, int16_t )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
|
||||
:"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(MEM_DYN( mvc, const int16_t ))
|
||||
:"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "cc"
|
||||
);
|
||||
return i;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
1992
common/x86/x86inc.asm
Normal file
1992
common/x86/x86inc.asm
Normal file
File diff suppressed because it is too large
Load Diff
937
common/x86/x86util.asm
Normal file
937
common/x86/x86util.asm
Normal file
@@ -0,0 +1,937 @@
|
||||
;*****************************************************************************
|
||||
;* x86util.asm: x86 utility macros
|
||||
;*****************************************************************************
|
||||
;* Copyright (C) 2008-2025 x264 project
|
||||
;*
|
||||
;* Authors: Holger Lubitz <holger@lubitz.org>
|
||||
;* Loren Merritt <lorenm@u.washington.edu>
|
||||
;*
|
||||
;* This program is free software; you can redistribute it and/or modify
|
||||
;* it under the terms of the GNU General Public License as published by
|
||||
;* the Free Software Foundation; either version 2 of the License, or
|
||||
;* (at your option) any later version.
|
||||
;*
|
||||
;* This program is distributed in the hope that it will be useful,
|
||||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;* GNU General Public License for more details.
|
||||
;*
|
||||
;* You should have received a copy of the GNU General Public License
|
||||
;* along with this program; if not, write to the Free Software
|
||||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
;*
|
||||
;* This program is also available under a commercial proprietary license.
|
||||
;* For more information, contact us at licensing@x264.com.
|
||||
;*****************************************************************************
|
||||
|
||||
; cextern_common name
; like cextern, but with a plain x264 prefix instead of a bitdepth-specific one:
; redefines %1 as the mangled symbol x264_<name>, records it in the
; cglobaled_<name> bookkeeping define and declares it extern.
|
||||
%macro cextern_common 1
|
||||
%xdefine %1 mangle(x264 %+ _ %+ %1)
|
||||
CAT_XDEFINE cglobaled_, %1, 1
|
||||
extern %1
|
||||
%endmacro
|
||||
|
||||
; Bit-depth dependent constants shared by the x86 asm files.
; BIT_DEPTH defaults to 0 when the build does not define it; any value above 8
; selects the HIGH_BIT_DEPTH variants (16-bit pixels, 32-bit DCT coefficients).
%ifndef BIT_DEPTH
|
||||
%assign BIT_DEPTH 0
|
||||
%endif
|
||||
|
||||
%if BIT_DEPTH > 8
|
||||
%assign HIGH_BIT_DEPTH 1
|
||||
%else
|
||||
%assign HIGH_BIT_DEPTH 0
|
||||
%endif
|
||||
|
||||
; strides in pixels; the byte strides (…STRIDEB) are derived below
%assign FENC_STRIDE 16
|
||||
%assign FDEC_STRIDE 32
|
||||
|
||||
; defaults for BIT_DEPTH <= 8; overridden below when HIGH_BIT_DEPTH is set
%assign SIZEOF_PIXEL 1
|
||||
%assign SIZEOF_DCTCOEF 2
|
||||
%define pixel byte
|
||||
%define vpbroadcastdct vpbroadcastw
|
||||
%define vpbroadcastpix vpbroadcastb
|
||||
%if HIGH_BIT_DEPTH
|
||||
%assign SIZEOF_PIXEL 2
|
||||
%assign SIZEOF_DCTCOEF 4
|
||||
%define pixel word
|
||||
%define vpbroadcastdct vpbroadcastd
|
||||
%define vpbroadcastpix vpbroadcastw
|
||||
%endif
|
||||
|
||||
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
|
||||
%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE
|
||||
|
||||
; largest representable pixel value (evaluates to 0 when BIT_DEPTH is left at its default of 0)
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
|
||||
|
||||
; FIX_STRIDES reg [, reg ...]
; In HIGH_BIT_DEPTH builds doubles every listed register (add r, r), turning a
; stride counted in pixels into one counted in bytes for 2-byte pixels.
; Expands to nothing otherwise. Modifies the flags when it emits code.
%macro FIX_STRIDES 1-*
|
||||
%if HIGH_BIT_DEPTH
|
||||
%rep %0
|
||||
add %1, %1
|
||||
%rotate 1
|
||||
%endrep
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; SBUTTERFLY size, a, b, tmp
; Interleaves registers m<a> and m<b> at the granularity given by %1
; (wd, dq, qdq, or dqqq for 128-bit lanes of ymm registers).
; The low interleave is written to m<a>, the high interleave to m<tmp>; the
; final SWAP then renames registers (assembly-time only, per x86inc) so that
; the name <b> refers to the high result and <tmp> to the leftover register.
%macro SBUTTERFLY 4
|
||||
%ifidn %1, dqqq
|
||||
vperm2i128 m%4, m%2, m%3, q0301 ; punpckh
|
||||
vinserti128 m%2, m%2, xm%3, 1 ; punpckl
|
||||
%elif avx_enabled && mmsize >= 16
|
||||
punpckh%1 m%4, m%2, m%3
|
||||
punpckl%1 m%2, m%3
|
||||
%else
|
||||
mova m%4, m%2 ; no 3-operand form available: copy first
|
||||
punpckl%1 m%2, m%3
|
||||
punpckh%1 m%4, m%3
|
||||
%endif
|
||||
SWAP %3, %4
|
||||
%endmacro
|
||||
|
||||
; SBUTTERFLY2 size, a, b, tmp
; Variant of SBUTTERFLY written with 3-operand instruction forms only: the low
; interleave goes to m<tmp>, the high interleave to m<a>, and the three names
; are then rotated by the SWAP (assembly-time renaming, per x86inc).
%macro SBUTTERFLY2 4
|
||||
punpckl%1 m%4, m%2, m%3
|
||||
punpckh%1 m%2, m%2, m%3
|
||||
SWAP %2, %4, %3
|
||||
%endmacro
|
||||
|
||||
; TRANSPOSE4x4W r0, r1, r2, r3, tmp
; Transposes a 4x4 block of 16-bit words held in m<r0>..m<r3> using two rounds
; of interleaves (word, then dword); m<tmp> is scratch. The trailing SWAP puts
; the rows back in order.
%macro TRANSPOSE4x4W 5
|
||||
SBUTTERFLY wd, %1, %2, %5
|
||||
SBUTTERFLY wd, %3, %4, %5
|
||||
SBUTTERFLY dq, %1, %3, %5
|
||||
SBUTTERFLY dq, %2, %4, %5
|
||||
SWAP %2, %3
|
||||
%endmacro
|
||||
|
||||
; TRANSPOSE2x4x4W r0, r1, r2, r3, tmp
; Three rounds of interleaves (word, dword, qword) over m<r0>..m<r3>, with
; m<tmp> as scratch. NOTE(review): going by the name this transposes two 4x4
; word blocks packed side by side in the four registers - confirm against the
; call sites.
%macro TRANSPOSE2x4x4W 5
|
||||
SBUTTERFLY wd, %1, %2, %5
|
||||
SBUTTERFLY wd, %3, %4, %5
|
||||
SBUTTERFLY dq, %1, %3, %5
|
||||
SBUTTERFLY dq, %2, %4, %5
|
||||
SBUTTERFLY qdq, %1, %2, %5
|
||||
SBUTTERFLY qdq, %3, %4, %5
|
||||
%endmacro
|
||||
|
||||
; TRANSPOSE4x4D r0, r1, r2, r3, tmp
; Transposes a 4x4 block of 32-bit dwords held in m<r0>..m<r3> using two rounds
; of interleaves (dword, then qword); m<tmp> is scratch. The trailing SWAP puts
; the rows back in order.
%macro TRANSPOSE4x4D 5
|
||||
SBUTTERFLY dq, %1, %2, %5
|
||||
SBUTTERFLY dq, %3, %4, %5
|
||||
SBUTTERFLY qdq, %1, %3, %5
|
||||
SBUTTERFLY qdq, %2, %4, %5
|
||||
SWAP %2, %3
|
||||
%endmacro
|
||||
|
||||
; TRANSPOSE8x8W r0..r7, tmp_or_spill0 [, spill1 [, flag]]
; Transposes an 8x8 block of 16-bit words held in eight registers.
;   x86-64: %9 is a ninth register used as scratch.
;   x86-32: only eight registers are available, so %9 and %10 are memory
;           operands used as spill slots; one row is always parked in memory
;           while the remaining registers take turns as SBUTTERFLY scratch.
;           Passing an 11th argument changes where one input/output row lives
;           (see the in/out notes inside the %else branch).
%macro TRANSPOSE8x8W 9-11
|
||||
%if ARCH_X86_64
|
||||
SBUTTERFLY wd, %1, %2, %9
|
||||
SBUTTERFLY wd, %3, %4, %9
|
||||
SBUTTERFLY wd, %5, %6, %9
|
||||
SBUTTERFLY wd, %7, %8, %9
|
||||
SBUTTERFLY dq, %1, %3, %9
|
||||
SBUTTERFLY dq, %2, %4, %9
|
||||
SBUTTERFLY dq, %5, %7, %9
|
||||
SBUTTERFLY dq, %6, %8, %9
|
||||
SBUTTERFLY qdq, %1, %5, %9
|
||||
SBUTTERFLY qdq, %2, %6, %9
|
||||
SBUTTERFLY qdq, %3, %7, %9
|
||||
SBUTTERFLY qdq, %4, %8, %9
|
||||
SWAP %2, %5
|
||||
SWAP %4, %7
|
||||
%else
|
||||
; in: m0..m7, unless %11 in which case m6 is in %9
|
||||
; out: m0..m7, unless %11 in which case m4 is in %10
|
||||
; spills into %9 and %10
|
||||
%if %0<11
|
||||
movdqa %9, m%7 ; park row %7 so its register can serve as scratch
|
||||
%endif
|
||||
SBUTTERFLY wd, %1, %2, %7
|
||||
movdqa %10, m%2
|
||||
movdqa m%7, %9 ; reload the parked row
|
||||
SBUTTERFLY wd, %3, %4, %2
|
||||
SBUTTERFLY wd, %5, %6, %2
|
||||
SBUTTERFLY wd, %7, %8, %2
|
||||
SBUTTERFLY dq, %1, %3, %2
|
||||
movdqa %9, m%3
|
||||
movdqa m%2, %10
|
||||
SBUTTERFLY dq, %2, %4, %3
|
||||
SBUTTERFLY dq, %5, %7, %3
|
||||
SBUTTERFLY dq, %6, %8, %3
|
||||
SBUTTERFLY qdq, %1, %5, %3
|
||||
SBUTTERFLY qdq, %2, %6, %3
|
||||
movdqa %10, m%2
|
||||
movdqa m%3, %9
|
||||
SBUTTERFLY qdq, %3, %7, %2
|
||||
SBUTTERFLY qdq, %4, %8, %2
|
||||
SWAP %2, %5
|
||||
SWAP %4, %7
|
||||
%if %0<11
|
||||
movdqa m%5, %10 ; bring the last spilled row back into its register
|
||||
%endif
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; WIDEN_SXWD src_dst, hi
; Sign-extends the words of m<%1> to dwords: the low half ends up in m<%1>,
; the high half in m<%2>.
; The previous contents of m<%2> do not matter: punpckhwd places the words of
; m<%1> in the upper half of each dword, and the arithmetic shift by 16 then
; discards whatever was in the lower half.
%macro WIDEN_SXWD 2
|
||||
punpckhwd m%2, m%1
|
||||
psrad m%2, 16
|
||||
%if cpuflag(sse4)
|
||||
pmovsxwd m%1, m%1
|
||||
%else
|
||||
punpcklwd m%1, m%1
|
||||
psrad m%1, 16
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; ABSW dst, src [, tmp]
; Per-word absolute value. With SSSE3 this is a single pabsw; the fallbacks
; compute max(x, -x), or (x ^ mask) - mask in the "sign" variant.
;   tmp == "sign": dst receives the sign mask (0 / -1 per word) and src is
;                  overwritten with |src|; the SWAP then exchanges the two
;                  names, so afterwards dst names |src| and src names the mask,
;                  ready to be passed to PSIGNW.
;   dst == src:    tmp is required as scratch.
;   otherwise:     dst is built directly from src (register or memory); the
;                  last branch is taken for a non-identifier src with a tmp.
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
|
||||
%if cpuflag(ssse3)
|
||||
pabsw %1, %2
|
||||
%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
|
||||
pxor %1, %1
|
||||
pcmpgtw %1, %2
|
||||
pxor %2, %1
|
||||
psubw %2, %1
|
||||
SWAP %1, %2
|
||||
%elifidn %1, %2
|
||||
pxor %3, %3
|
||||
psubw %3, %1
|
||||
pmaxsw %1, %3
|
||||
%elifid %2
|
||||
pxor %1, %1
|
||||
psubw %1, %2
|
||||
pmaxsw %1, %2
|
||||
%elif %0 == 2
|
||||
pxor %1, %1
|
||||
psubw %1, %2
|
||||
pmaxsw %1, %2
|
||||
%else
|
||||
mova %1, %2
|
||||
pxor %3, %3
|
||||
psubw %3, %1
|
||||
pmaxsw %1, %3
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; ABSW2 dst1, dst2, src1, src2, tmp1, tmp2
; Two interleaved per-word absolute values: dst1 = |src1|, dst2 = |src2|.
; The temporaries are only used by the non-SSSE3 in-place case (dst1 == src1),
; where the result is formed as max(x, 0 - x).
; NOTE(review): the in-place branch tests only dst1 against src1 and then
; reads dst2 as its second source, so dst2 == src2 is presumably required
; whenever dst1 == src1 - confirm against the callers.
%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
|
||||
%if cpuflag(ssse3)
|
||||
pabsw %1, %3
|
||||
pabsw %2, %4
|
||||
%elifidn %1, %3
|
||||
pxor %5, %5
|
||||
pxor %6, %6
|
||||
psubw %5, %1
|
||||
psubw %6, %2
|
||||
pmaxsw %1, %5
|
||||
pmaxsw %2, %6
|
||||
%else
|
||||
pxor %1, %1
|
||||
pxor %2, %2
|
||||
psubw %1, %3
|
||||
psubw %2, %4
|
||||
pmaxsw %1, %3
|
||||
pmaxsw %2, %4
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
; ABSB reg, tmp -- per-byte absolute value, in place on %1.
;   %2 is written only on the pre-SSSE3 path.
%macro ABSB 2
%if cpuflag(ssse3)
    pabsb   %1, %1
%else
    pxor    %2, %2
    psubb   %2, %1                  ; tmp = -x
    pminub  %1, %2                  ; unsigned minimum of x and -x
%endif
%endmacro
|
||||
|
||||
; ABSD dst, src[, tmp] -- per-dword absolute value.
;   ssse3 and later : one pabsd.
;   otherwise       : builds the sign mask in %1 and computes (x ^ mask) - mask
;                     in a work register, then SWAPs so %1 names the result.
;                     With two args the work register is src itself (src is
;                     modified); with three args src is copied to tmp first.
%macro ABSD 2-3
%if cpuflag(ssse3)
    pabsd   %1, %2
%else
    %define %%src %2
    %if %0 == 3
        mova    %3, %2
        %define %%src %3
    %endif
    pxor    %1, %1
    pcmpgtd %1, %%src               ; %1 = all-ones in lanes where the value is negative
    pxor    %%src, %1
    psubd   %%src, %1
    SWAP    %1, %%src
%endif
%endmacro
|
||||
|
||||
; PSIGN size, dst, [src,] sign
;   %1 is the element-size suffix (w or d) pasted onto psign/psub.
;   ssse3 and later : psign<size>, in two- or three-operand form.
;   otherwise       : (x ^ s) - s. That equals x where s is 0 and -x where s is
;                     all-ones, so this path presumably expects s to be a 0/-1
;                     mask rather than arbitrary values -- verify against callers.
%macro PSIGN 3-4
%if cpuflag(ssse3) && %0 == 4
    psign%1 %2, %3, %4
%elif cpuflag(ssse3)
    psign%1 %2, %3
%elif %0 == 4
    pxor    %2, %3, %4
    psub%1  %2, %4
%else
    pxor    %2, %3
    psub%1  %2, %3
%endif
%endmacro

; Size-specific front ends; the trailing comma lets the caller's operands follow.
%define PSIGNW PSIGN w,
%define PSIGND PSIGN d,
|
||||
|
||||
; SPLATB_LOAD dst, addr, shuf
;   Loads the dword that ends at byte [addr], so the wanted byte is byte 3.
;   ssse3 and later : pshufb with the caller-supplied mask %3 (presumably one
;                     that replicates byte 3 -- the mask is built elsewhere).
;   otherwise       : doubles each byte into a word, then broadcasts word 3.
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
    movd      %1, [%2-3]
    pshufb    %1, %3
%else
    movd      %1, [%2-3] ;to avoid crossing a cacheline
    punpcklbw %1, %1                ; word i = (byte i, byte i)
    SPLATW    %1, %1, 3
%endif
%endmacro
|
||||
|
||||
; SPLATW dst, src[, index] -- broadcast one word of src across dst.
;   index defaults to 0. avx2 with index 0 is a single vpbroadcastw.
;   Otherwise the shuffle source is chosen as:
;     src is an identifier     -> its xmm-prefixed name
;     src is not, and index 0  -> src is movd-loaded into dst first
;     src is not, index != 0   -> src is used directly as the shuffle operand
;   (index)*q1111 is the shuffle immediate; q1111 is defined outside this chunk.
%imacro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
    vpbroadcastw %1, %2
%else
    %define %%src %2
    %ifid %2
        %define %%src xmm%2
    %elif %3 == 0
        movd      xmm%1, %2
        %define %%src xmm%1
    %endif
    PSHUFLW   xmm%1, %%src, (%3)*q1111
    ; widen the replicated low quadword to the full register width
    %if mmsize >= 32
        vpbroadcastq %1, xmm%1
    %elif mmsize == 16
        punpcklqdq %1, %1
    %endif
%endif
%endmacro
|
||||
|
||||
; SPLATD dst, src[, index] -- broadcast one dword of src across dst.
;   index defaults to 0. avx2 with index 0 is a single vpbroadcastd.
;   The shuffle source is chosen exactly as in SPLATW (identifier -> xmm name,
;   non-identifier with index 0 -> movd into dst first, else used directly).
;   With mmsize == 8 only indices 0 and 1 get dedicated paths: an unpack when
;   dst already holds the source, a pshufw otherwise.
%imacro SPLATD 2-3 0
%if cpuflag(avx2) && %3 == 0
    vpbroadcastd %1, %2
%else
    %define %%src %2
    %ifid %2
        %define %%src xmm%2
    %elif %3 == 0
        movd      xmm%1, %2
        %define %%src xmm%1
    %endif
    %if mmsize == 8 && %3 == 0
        %ifidn %1, %%src
            punpckldq %1, %1
        %else
            pshufw    %1, %%src, q1010
        %endif
    %elif mmsize == 8 && %3 == 1
        %ifidn %1, %%src
            punpckhdq %1, %1
        %else
            pshufw    %1, %%src, q3232
        %endif
    %else
        pshufd    xmm%1, %%src, (%3)*q1111
    %endif
    %if mmsize >= 32
        vpbroadcastq %1, xmm%1      ; replicate the low quadword across the wider register
    %endif
%endif
%endmacro
|
||||
|
||||
; CLIPW dst, min, max -- clamp each signed word of dst to [min, max].
;   The lower bound is applied first, then the upper bound.
%macro CLIPW 3 ;(dst, min, max)
    pmaxsw  %1, %2
    pminsw  %1, %3
%endmacro
|
||||
|
||||
; MOVHL dst, src -- bring the high quadword of src down into the low quadword
; of dst. The instruction used depends on operand aliasing and CPU flags; what
; ends up in dst's high quadword differs between the variants.
%macro MOVHL 2 ; dst, src
%ifidn %1, %2
    punpckhqdq %1, %2
%elif cpuflag(avx)
    punpckhqdq %1, %2, %2
%elif cpuflag(sse4)
    pshufd     %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
%else
    movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
%endif
%endmacro
|
||||
|
||||
; HADDD sum, junk -- horizontal add of the dwords in %1, folding the register
; in half repeatedly; %2 is scratch. The arguments are token-pasted after
; sizeof/xmm/ymm/zmm, which are resolved by definitions outside this chunk.
; The stages run from the widest register size down to a single xmm.
%macro HADDD 2 ; sum junk
%if sizeof%1 >= 64
    vextracti32x8 ymm%2, zmm%1, 1   ; upper 256 bits
    paddd         ymm%1, ymm%2
%endif
%if sizeof%1 >= 32
    vextracti128  xmm%2, ymm%1, 1   ; upper 128 bits
    paddd         xmm%1, xmm%2
%endif
%if sizeof%1 >= 16
    MOVHL         xmm%2, xmm%1      ; upper 64 bits
    paddd         xmm%1, xmm%2
%endif
%if cpuflag(xop) && sizeof%1 == 16
    vphadddq      xmm%1, xmm%1
%else
    PSHUFLW       xmm%2, xmm%1, q1032
    paddd         xmm%1, xmm%2
%endif
%endmacro
|
||||
|
||||
; HADDW reg, tmp -- horizontal add of the signed words in %1.
;   xop, 16-byte regs : vphaddwq, then the two halves are added.
;   otherwise         : pmaddwd against [pw_1] (a constant defined outside
;                       this chunk) pairs words into dwords, HADDD finishes.
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
    vphaddwq  %1, %1
    MOVHL     %2, %1
    paddd     %1, %2
%else
    pmaddwd   %1, [pw_1]
    HADDD     %1, %2
%endif
%endmacro
|
||||
|
||||
; HADDUWD reg, tmp -- add the two unsigned words inside every dword of %1,
; leaving one dword sum per lane. %2 is written only on the non-XOP path.
%macro HADDUWD 2
%if cpuflag(xop) && sizeof%1 == 16
    vphadduwd %1, %1
%else
    psrld     %2, %1, 16            ; tmp = high word of each dword, zero-extended
    pslld     %1, 16
    psrld     %1, 16                ; reg = low word of each dword, zero-extended
    paddd     %1, %2
%endif
%endmacro
|
||||
|
||||
; HADDUW reg, tmp -- horizontal add of the unsigned words in %1.
;   xop, 16-byte regs : vphadduwq, then the two halves are added.
;   otherwise         : HADDUWD pairs words into dwords, HADDD finishes.
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
    vphadduwq %1, %1
    MOVHL     %2, %1
    paddd     %1, %2
%else
    HADDUWD   %1, %2
    HADDD     %1, %2
%endif
%endmacro
|
||||
|
||||
; PALIGNR [dst,] src1, src2, imm, tmp
;   32-byte regs : %5 is not scratch but a precalculated extra input that can
;                  be re-used across calls (see the vperm2i128 note below).
;   ssse3        : plain palignr; %0 == 5 selects the form with an explicit
;                  dst, otherwise src1 is the destination.
;   otherwise    : emulated with two byte shifts and an OR; tmp is overwritten.
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
%if sizeof%1==32
    ; %3 = abcdefgh ijklmnop (lower address)
    ; %2 = ABCDEFGH IJKLMNOP (higher address)
    ; expected in %5:  vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
    %if %4 < 16
        palignr    %1, %5, %3, %4    ; %1 = bcdefghi jklmnopA
    %else
        palignr    %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO
    %endif
%elif cpuflag(ssse3)
    %if %0==5
        palignr %1, %2, %3, %4
    %else
        palignr %1, %2, %3
    %endif
%else
    ; %%out is bound before the %rotate so it keeps naming the destination.
    %define %%out %1
    %if %0==5
        %ifnidn %1, %2
            mova %%out, %2
        %endif
        %rotate 1                   ; from here on: %1=src1 %2=src2 %3=imm %4=tmp
    %endif
    %ifnidn %4, %2
        mova %4, %2
    %endif
    %if mmsize==8
        psllq  %%out, (8-%3)*8
        psrlq  %4, %3*8
    %else
        pslldq %%out, 16-%3
        psrldq %4, %3
    %endif
    por %%out, %4
%endif
%endmacro
|
||||
|
||||
; PSHUFLW operands... -- forwards the whole operand list to pshufw when
; mmsize is 8 and to pshuflw for every other register size.
%macro PSHUFLW 1+
%if mmsize == 8
    pshufw  %1
%else
    pshuflw %1
%endif
%endmacro
|
||||
|
||||
; PSLLPIX / PSRLPIX dst, src, shift
;   shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
;   values shifted in are undefined
;   faster if dst==src
; Both names expand to PSXLPIX with the direction letter (l/r) and a sign
; (-1/1) prepended; the caller's dst, src, shift follow the trailing comma.
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift

; PSXLPIX dir, sign, dst, src, shift
;   mmsize 8, odd shift  : 64-bit logical shift by shift*8 bits.
;   mmsize 8, even shift : pshufw whose immediate is the q3210 pattern moved by
;                          whole 2-bit fields (q3210 is defined outside this chunk).
;   otherwise            : whole-register byte shift by shift*2 bytes.
%macro PSXLPIX 5
%if mmsize == 8
    %if %5&1
        ps%1lq  %3, %4, %5*8
    %else
        pshufw  %3, %4, (q3210<<8>>(8+%2*%5))&0xff
    %endif
%else
    ps%1ldq %3, %4, %5*2
%endif
%endmacro
|
||||
|
||||
; DEINTB mask1, reg1, mask2, reg2, src -- split the bytes of two registers
; into their even and odd positions, one byte per word.
;   %1..%4 are register numbers. %5 is either the number of a register already
;   holding the byte mask, or an operand the mask is loaded from.
;   out: m%1 = m%2 & mask, m%3 = m%4 & mask, m%2 and m%4 = original >> 8 per word.
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
    pand   m%3, m%5, m%4 ; src .. y6 .. y4
    pand   m%1, m%5, m%2 ; dst .. y6 .. y4
%else
    mova   m%1, %5       ; fetch the mask, then reuse m%1 for the masked result
    pand   m%3, m%1, m%4 ; src .. y6 .. y4
    pand   m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
    psrlw  m%2, 8        ; dst .. y7 .. y5
    psrlw  m%4, 8        ; src .. y7 .. y5
%endmacro
|
||||
|
||||
; SUMSUB_BA size, a, b[, tmp] -- butterfly on two registers.
;   %1 is the element-size suffix pasted onto padd/psub; %2..%4 are register
;   numbers. With a = m%2 and b = m%3 on entry:
;     out: m%2 = a + b, m%3 = b - a
;   Without tmp the difference is formed as 2*b - (a + b).
%macro SUMSUB_BA 3-4
%if %0==3
    padd%1  m%2, m%3                ; a + b
    padd%1  m%3, m%3                ; 2b
    psub%1  m%3, m%2                ; 2b - (a + b) = b - a
%elif avx_enabled
    padd%1  m%4, m%2, m%3           ; sum goes to tmp, so a is still intact below
    psub%1  m%3, m%2
    SWAP    %2, %4
%else
    mova    m%4, m%2                ; keep a copy of a
    padd%1  m%2, m%3
    psub%1  m%3, m%4
%endif
%endmacro
|
||||
|
||||
; SUMSUB_BADC size, a, b, c, d[, tmp] -- two butterflies: (a, b) and (c, d).
;   With a sixth argument this is two SUMSUB_BA calls sharing that temp.
;   Without it, the two temp-free butterflies are interleaved.
%macro SUMSUB_BADC 5-6
%if %0==6
    SUMSUB_BA %1, %2, %3, %6
    SUMSUB_BA %1, %4, %5, %6
%else
    padd%1  m%2, m%3                ; a + b
    padd%1  m%4, m%5                ; c + d
    padd%1  m%3, m%3
    padd%1  m%5, m%5
    psub%1  m%3, m%2                ; b - a
    psub%1  m%5, m%4                ; d - c
%endif
%endmacro
|
||||
|
||||
; HADAMARD4_V r0, r1, r2, r3[, tmp] -- two word-butterfly stages over four
; registers. The parameter count is `4+`, so anything written after the fourth
; register travels inside %4 and reaches SUMSUB_BADC as an extra argument.
%macro HADAMARD4_V 4+
    SUMSUB_BADC w, %1, %2, %3, %4
    SUMSUB_BADC w, %1, %3, %2, %4
%endmacro
|
||||
|
||||
; HADAMARD8_V r0..r7[, tmp] -- three word-butterfly stages over eight
; registers, pairing at distance 1, then 2, then 4. The parameter count is
; `8+`, so anything written after the eighth register travels inside %8.
%macro HADAMARD8_V 8+
    ; distance 1
    SUMSUB_BADC w, %1, %2, %3, %4
    SUMSUB_BADC w, %5, %6, %7, %8
    ; distance 2
    SUMSUB_BADC w, %1, %3, %2, %4
    SUMSUB_BADC w, %5, %7, %6, %8
    ; distance 4
    SUMSUB_BADC w, %1, %5, %2, %6
    SUMSUB_BADC w, %3, %7, %4, %8
%endmacro
|
||||
|
||||
; TRANS_SSE2 width, order, a, b, tmp1[, tmp2] -- TRANSPOSE2x2
;   %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
;   %2: ord/unord (for compat with sse4, unused)
;   %3/%4: source regs
;   %5/%6: tmp regs
; Note: `mask` and `shift` are set with plain %define, so they stay defined
; after the macro has been expanded. mask_10 / mask_1100 live outside this chunk.
%macro TRANS_SSE2 5-6
%ifidn %1, d
    %define mask [mask_10]
    %define shift 16
%elifidn %1, q
    %define mask [mask_1100]
    %define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
    mova   m%5, mask   ; ff00
    mova   m%6, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pand   m%6, m%5    ; x5..
    pandn  m%5, m%3    ; ..x0
    psrl%1 m%3, shift  ; ..x1
    por    m%4, m%5    ; x4x0
    por    m%3, m%6    ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
    mova   m%5, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pxor   m%4, m%3    ; (x4^x1)x0
    pand   m%4, mask   ; (x4^x1)..
    pxor   m%3, m%4    ; x4x0
    psrl%1 m%4, shift  ; ..(x1^x4)
    pxor   m%5, m%4    ; x5x1
    SWAP   %4, %3, %5
%endif
%endmacro
|
||||
|
||||
; TRANS_SSE4 width, order, a, b, tmp[, tmp2] -- same argument layout as
; TRANS_SSE2; the sixth argument is accepted but never referenced.
;   width d, ord   : shift + pblendw on each side, results kept in order.
;   width d, unord : one pblendw plus shift/shift/por.
;   width q        : two shufps.
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
    %ifidn %2, ord
        psrl%1  m%5, m%3, 16
        pblendw m%5, m%4, q2222
        psll%1  m%4, 16
        pblendw m%4, m%3, q1111
        SWAP     %3, %5
    %else
        %if avx_enabled
            pblendw m%5, m%3, m%4, q2222
            SWAP     %3, %5
        %else
            mova    m%5, m%3
            pblendw m%3, m%4, q2222
        %endif
        psll%1  m%4, 16
        psrl%1  m%5, 16
        por     m%4, m%5
    %endif
%elifidn %1, q
    shufps m%5, m%3, m%4, q3131
    shufps m%3, m%3, m%4, q2020
    SWAP    %4, %5
%endif
%endmacro
|
||||
|
||||
; TRANS_XOP width, order, a, b, tmp[, tmp2] -- same argument layout as
; TRANS_SSE2; %2 and %6 are never referenced.
;   width d : two vpperm with the transd_shuf1/transd_shuf2 tables (defined
;             outside this chunk).
;   width q : two shufps.
;   The closing SWAP runs for every width, including unmatched ones.
%macro TRANS_XOP 5-6
%ifidn %1, d
    vpperm m%5, m%3, m%4, [transd_shuf1]
    vpperm m%3, m%3, m%4, [transd_shuf2]
%elifidn %1, q
    shufps m%5, m%3, m%4, q3131
    shufps m%3, m%4, q2020
%endif
    SWAP    %4, %5
%endmacro
|
||||
|
||||
; HADAMARD dist, op, a, b, tmp[, tmp2] -- one Hadamard step on two registers.
;   %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
;   %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
;   %3/%4: regs
;   %5(%6): tmpregs
; A nonzero distance first regroups the data (TRANS or SBUTTERFLY, both defined
; outside this chunk) so the operation itself is always element-wise.
; Note: ORDER is set with a plain %define and stays defined afterwards.
%macro HADAMARD 5-6
%if %1!=0 ; have to reorder stuff for horizontal op
    %ifidn %2, sumsub
        %define ORDER ord
        ; sumsub needs order because a-b != b-a unless a=b
    %else
        %define ORDER unord
        ; if we just max, order doesn't matter (allows pblendw+or in sse4)
    %endif
    %if %1==1
        TRANS d, ORDER, %3, %4, %5, %6
    %elif %1==2
        %if mmsize==8
            SBUTTERFLY dq, %3, %4, %5
        %elif %0==6
            TRANS q, ORDER, %3, %4, %5, %6
        %else
            TRANS q, ORDER, %3, %4, %5
        %endif
    %elif %1==4
        SBUTTERFLY qdq, %3, %4, %5
    %elif %1==8
        SBUTTERFLY dqqq, %3, %4, %5
    %endif
%endif
%ifidn %2, sumsub
    SUMSUB_BA w, %3, %4, %5
%else
    %ifidn %2, amax
        %if %0==6
            ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
        %else
            ABSW m%3, m%3, m%5
            ABSW m%4, m%4, m%5
        %endif
    %endif
    pmaxsw m%3, m%4                 ; max and amax both leave their result in m%3
%endif
%endmacro
|
||||
|
||||
|
||||
; HADAMARD2_2D a, b, c, d, tmp, interleave[, op]
;   Sum/difference step on (a, b) and (c, d), then for each pair an SBUTTERFLY
;   of granularity %6 followed by a second step.
;   %7 selects the second step and defaults to sumsub. A numeric %7 is treated
;   as a second temp register number and selects amax.
%macro HADAMARD2_2D 6-7 sumsub
    HADAMARD 0, sumsub, %1, %2, %5
    HADAMARD 0, sumsub, %3, %4, %5
    ; first pair
    SBUTTERFLY %6, %1, %2, %5
%ifnum %7
    HADAMARD 0, amax, %1, %2, %5, %7
%else
    HADAMARD 0, %7, %1, %2, %5
%endif
    ; second pair
    SBUTTERFLY %6, %3, %4, %5
%ifnum %7
    HADAMARD 0, amax, %3, %4, %5, %7
%else
    HADAMARD 0, %7, %3, %4, %5
%endif
%endmacro
|
||||
|
||||
; HADAMARD4_2D a, b, c, d, tmp[, op] -- two HADAMARD2_2D passes (wd, then dq
; interleave); %6 (default sumsub) picks the final operation of the second
; pass. The middle two register names are exchanged at the end.
%macro HADAMARD4_2D 5-6 sumsub
    HADAMARD2_2D %1, %2, %3, %4, %5, wd
    HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
    SWAP %2, %3
%endmacro
|
||||
|
||||
; HADAMARD4_2D_SSE a, b, c, d, tmp[, op] -- 4x4 2D Hadamard variant with the
; stages written out; %6 (default sumsub) picks the final operation.
; The m0/m1 and row labels below are the original author's layout notes.
%macro HADAMARD4_2D_SSE 5-6 sumsub
    HADAMARD 0, sumsub, %1, %2, %5  ; 1st V row 0 + 1
    HADAMARD 0, sumsub, %3, %4, %5  ; 1st V row 2 + 3
    SBUTTERFLY wd, %1, %2, %5       ; %1: m0 1+0 %2: m1 1+0
    SBUTTERFLY wd, %3, %4, %5       ; %3: m0 3+2 %4: m1 3+2
    HADAMARD2_2D %1, %3, %2, %4, %5, dq
    SBUTTERFLY qdq, %1, %2, %5
    HADAMARD 0, %6, %1, %2, %5      ; 2nd H m1/m0 row 0+1
    SBUTTERFLY qdq, %3, %4, %5
    HADAMARD 0, %6, %3, %4, %5      ; 2nd H m1/m0 row 2+3
%endmacro
|
||||
|
||||
; HADAMARD8_2D r0..r7, tmp[, op] -- 2D Hadamard over eight registers, built
; from HADAMARD2_2D passes at wd, dq and qdq granularity.
;   %10 (default sumsub) is forwarded to the two qdq passes only.
;   The closing register-name swaps are skipped when %10 is amax.
%macro HADAMARD8_2D 9-10 sumsub
    HADAMARD2_2D %1, %2, %3, %4, %9, wd
    HADAMARD2_2D %5, %6, %7, %8, %9, wd
    HADAMARD2_2D %1, %3, %2, %4, %9, dq
    HADAMARD2_2D %5, %7, %6, %8, %9, dq
    HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
    HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
%ifnidn %10, amax
    SWAP %2, %5
    SWAP %4, %7
%endif
%endmacro
|
||||
|
||||
; HADAMARD8_2D_HMUL r0..r7, tmp1, tmp2
;   doesn't include the "pmaddubsw hmul_8p" pass
;   After the first amax step the remaining three pass %5 as their second
;   temp instead of %10 (HADAMARD's amax leaves its result in the first of
;   its two registers).
%macro HADAMARD8_2D_HMUL 10
    HADAMARD4_V %1, %2, %3, %4, %9
    HADAMARD4_V %5, %6, %7, %8, %9
    SUMSUB_BADC w, %1, %5, %2, %6, %9
    HADAMARD 2, sumsub, %1, %5, %9, %10
    HADAMARD 2, sumsub, %2, %6, %9, %10
    SUMSUB_BADC w, %3, %7, %4, %8, %9
    HADAMARD 2, sumsub, %3, %7, %9, %10
    HADAMARD 2, sumsub, %4, %8, %9, %10
    HADAMARD 1, amax, %1, %5, %9, %10
    HADAMARD 1, amax, %2, %6, %9, %5
    HADAMARD 1, amax, %3, %7, %9, %5
    HADAMARD 1, amax, %4, %8, %9, %5
%endmacro
|
||||
|
||||
; SUMSUB2_AB size, a, b, out
;   %1 is the element-size suffix; %2 and %4 are register numbers; %3 is a
;   register number or any other operand (e.g. memory).
;   With a = m%2 and b = %3 on entry, the non-XOP paths produce
;     m%2 = 2*a + b, m%4 = a - 2*b
;   The XOP path uses multiply-accumulate with the constants [p<size>_m2] and
;   [p<size>_2], which are defined outside this chunk (presumably -2 and 2).
%macro SUMSUB2_AB 4
%if cpuflag(xop)
    pmacs%1%1 m%4, m%3, [p%1_m2], m%2
    pmacs%1%1 m%2, m%2, [p%1_2], m%3
%elifnum %3
    psub%1  m%4, m%2, m%3           ; a - b
    psub%1  m%4, m%3                ; a - 2b
    padd%1  m%2, m%2                ; 2a
    padd%1  m%2, m%3                ; 2a + b
%else
    mova    m%4, m%2
    padd%1  m%2, m%2                ; 2a
    padd%1  m%2, %3                 ; 2a + b
    psub%1  m%4, %3
    psub%1  m%4, %3                 ; a - 2b
%endif
%endmacro
|
||||
|
||||
; SUMSUBD2_AB size, a, b, t1, t2
;   %1 is the element-size suffix; %2/%3 are register numbers. %4/%5 are
;   either register numbers (results are built in them and the names swapped)
;   or operands used to save the inputs (results are built in place).
;   With a = m%2 and b = m%3 on entry, both paths give
;     %2 = (a >> 1) - b, %3 = (b >> 1) + a
%macro SUMSUBD2_AB 5
%ifnum %4
    psra%1  m%5, m%2, 1  ; t2 = a >> 1
    psra%1  m%4, m%3, 1  ; t1 = b >> 1
    padd%1  m%4, m%2     ; %3: %3>>1+%2
    psub%1  m%5, m%3     ; %2: %2>>1-%3
    SWAP     %2, %5
    SWAP     %3, %4
%else
    mova    %5, m%2      ; save a
    mova    %4, m%3      ; save b
    psra%1  m%3, 1       ; %3: %3>>1
    psra%1  m%2, 1       ; %2: %2>>1
    padd%1  m%3, %5      ; %3: %3>>1+%2
    psub%1  m%2, %4      ; %2: %2>>1-%3
%endif
%endmacro
|
||||
|
||||
; DCT4_1D r0, r1, r2, r3, tmp -- one 1-D pass of a 4-point transform on words,
; built from SUMSUB_BADC / SUMSUB_BA / SUMSUB2_AB.
;   tmp numeric     : a spare register number, used as scratch throughout.
;   tmp non-numeric : an address; m%2 is spilled to [tmp] so that register
;                     can serve as SUMSUB2_AB's output.
;   The final SWAP renames the registers into output order.
%macro DCT4_1D 5
%ifnum %5
    SUMSUB_BADC w, %4, %1, %3, %2, %5
    SUMSUB_BA   w, %3, %4, %5
    SUMSUB2_AB  w, %1, %2, %5
    SWAP %1, %3, %4, %5, %2
%else
    SUMSUB_BADC w, %4, %1, %3, %2
    SUMSUB_BA   w, %3, %4
    mova     [%5], m%2
    SUMSUB2_AB  w, %1, [%5], %2
    SWAP %1, %3, %4, %2
%endif
%endmacro
|
||||
|
||||
; IDCT4_1D size, r0, r1, r2, r3, tmp[, tmp2] -- one 1-D pass of a 4-point
; inverse transform.
;   %1 is the element-size suffix; %2..%5 are register numbers.
;   tmp numeric     : %6 and %7 are spare register numbers.
;   tmp non-numeric : %6 is an address; two spill slots are used, the second
;                     at +16 for size w and +32 otherwise.
%macro IDCT4_1D 6-7
%ifnum %6
    SUMSUBD2_AB %1, %3, %5, %7, %6
    ; %3: %3>>1-%5 %5: %3+%5>>1
    SUMSUB_BA   %1, %4, %2, %7
    ; %4: %2+%4 %2: %2-%4
    SUMSUB_BADC %1, %5, %4, %3, %2, %7
    ; %5: %2+%4 + (%3+%5>>1)
    ; %4: %2+%4 - (%3+%5>>1)
    ; %3: %2-%4 + (%3>>1-%5)
    ; %2: %2-%4 - (%3>>1-%5)
%else
    %ifidn %1, w
        SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
    %else
        SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
    %endif
    SUMSUB_BA   %1, %4, %2
    SUMSUB_BADC %1, %5, %4, %3, %2
%endif
    SWAP %2, %5, %4
    ; after the rename:
    ; %2: %2+%4 + (%3+%5>>1) row0
    ; %3: %2-%4 + (%3>>1-%5) row1
    ; %4: %2-%4 - (%3>>1-%5) row2
    ; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
|
||||
|
||||
|
||||
; LOAD_DIFF dst, tmp, zero, [a], [b][, aligned] -- dst = a - b as words.
;   HIGH_BIT_DEPTH : samples are already words. %6 (default 1) says whether
;                    [b] may be used directly as psubw's memory operand; if
;                    not, avx subtracts from memory anyway and older CPUs
;                    load [b] into tmp first.
;   otherwise      : bytes are loaded with movh and widened. %3 supplies the
;                    high bytes, or is the literal `none`, in which case b's
;                    own bytes are used as the high bytes of both operands and
;                    cancel in the subtraction.
%macro LOAD_DIFF 5-6 1
%if HIGH_BIT_DEPTH
    %if %6 ; %5 aligned?
        mova       %1, %4
        psubw      %1, %5
    %elif cpuflag(avx)
        movu       %1, %4
        psubw      %1, %5
    %else
        movu       %1, %4
        movu       %2, %5
        psubw      %1, %2
    %endif
%else ; !HIGH_BIT_DEPTH
    movh       %1, %4
    movh       %2, %5
    %ifidn %3, none
        punpcklbw  %1, %2
        punpcklbw  %2, %2
    %else
        punpcklbw  %1, %3
        punpcklbw  %2, %3
    %endif
    psubw      %1, %2
%endif ; HIGH_BIT_DEPTH
%endmacro
|
||||
|
||||
; LOAD_DIFF8x4 d0, d1, d2, d3, tmp, mul, enc_ptr, dec_ptr
;   %1..%4 serve twice: as destination register numbers and as the row indices
;   multiplied by FENC_STRIDE / FDEC_STRIDE.
;   8-bit with ssse3 : the bytes from %7 (enc) and %8 (dec) are interleaved and
;                      reduced with pmaddubsw against m%6 (presumably a +1/-1
;                      byte pattern so each word becomes enc - dec; the
;                      constant is set up by the caller).
;                      Each dec row is loaded into the next destination
;                      register, the last one into m%5.
;   otherwise        : four LOAD_DIFF calls with m%5 and m%6 forwarded as its
;                      second and third arguments.
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
%if BIT_DEPTH == 8 && cpuflag(ssse3)
    movh       m%2, [%8+%1*FDEC_STRIDE]
    movh       m%1, [%7+%1*FENC_STRIDE]
    punpcklbw  m%1, m%2
    movh       m%3, [%8+%2*FDEC_STRIDE]
    movh       m%2, [%7+%2*FENC_STRIDE]
    punpcklbw  m%2, m%3
    movh       m%4, [%8+%3*FDEC_STRIDE]
    movh       m%3, [%7+%3*FENC_STRIDE]
    punpcklbw  m%3, m%4
    movh       m%5, [%8+%4*FDEC_STRIDE]
    movh       m%4, [%7+%4*FENC_STRIDE]
    punpcklbw  m%4, m%5
    pmaddubsw  m%1, m%6
    pmaddubsw  m%2, m%6
    pmaddubsw  m%3, m%6
    pmaddubsw  m%4, m%6
%else
    LOAD_DIFF  m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
    LOAD_DIFF  m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
    LOAD_DIFF  m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
    LOAD_DIFF  m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
|
||||
|
||||
; STORE_DCT r0, r1, r2, r3, base, offset
;   Writes four registers to [base+offset] as eight 8-byte pieces: the low
;   halves of m%1..m%4 at +0..+24, then their high halves at +32..+56.
%macro STORE_DCT 6
    ; low quadwords
    movq   [%5+%6+ 0], m%1
    movq   [%5+%6+ 8], m%2
    movq   [%5+%6+16], m%3
    movq   [%5+%6+24], m%4
    ; high quadwords
    movhps [%5+%6+32], m%1
    movhps [%5+%6+40], m%2
    movhps [%5+%6+48], m%3
    movhps [%5+%6+56], m%4
%endmacro
|
||||
|
||||
; STORE_IDCT a, b, c, d -- stores four registers as eight rows around r0,
; from r0-4*FDEC_STRIDE to r0+3*FDEC_STRIDE. Each register supplies two
; consecutive rows: its high half goes to the earlier row, its low half to
; the next one. r0 is hard-coded as the base pointer.
%macro STORE_IDCT 4
    movhps [r0-4*FDEC_STRIDE], %1
    movh   [r0-3*FDEC_STRIDE], %1
    movhps [r0-2*FDEC_STRIDE], %2
    movh   [r0-1*FDEC_STRIDE], %2
    movhps [r0+0*FDEC_STRIDE], %3
    movh   [r0+1*FDEC_STRIDE], %3
    movhps [r0+2*FDEC_STRIDE], %4
    movh   [r0+3*FDEC_STRIDE], %4
%endmacro
|
||||
|
||||
; LOAD_DIFF_8x4P d0, d1, d2, d3, t0, t1, z[, p1, p2, increment, aligned]
;   Four LOAD_DIFF calls on consecutive rows of the two pointers %8 and %9
;   (defaults r0 and r2). %7 is forwarded as LOAD_DIFF's third argument and
;   %11 (default 1) as its alignment flag.
;   Row offsets use r1 / r3 for %8 / %9, with r4 / r5 for the fourth row
;   (presumably 3*stride, set up by the caller).
;   A nonzero %10 (default 0) advances both pointers by four rows afterwards.
%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9],      %11
    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3],   %11
    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5],   %11
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro
|
||||
|
||||
; LOAD_DIFF16x2_AVX2 dst0, dst1, tmp0, tmp1, row0, row1
;   2xdst, 2xtmp, 2xsrcrow
;   Zero-extends the bytes of two rows from r1 (FENC_STRIDE apart) and two
;   rows from r2 (FDEC_STRIDE apart, row index biased by -4) to words, then
;   leaves r1 row - r2 row in each dst. r1 and r2 are hard-coded.
%macro LOAD_DIFF16x2_AVX2 6
    pmovzxbw m%1, [r1+%5*FENC_STRIDE]
    pmovzxbw m%2, [r1+%6*FENC_STRIDE]
    pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
    pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
    psubw    m%1, m%3
    psubw    m%2, m%4
%endmacro
|
||||
|
||||
; DIFFx2 res0, res1, tmp, zero, [row0], [row1][, unused]
;   For each of the two rows: widens the bytes loaded from memory to words
;   (high bytes taken from %4, presumably a zero register), adds the word
;   values of the matching res register shifted right by 6 with signed
;   saturation, and finally packs both rows to unsigned bytes in %2
;   (row1 in the low half, row0 in the high half). %7 is never referenced.
%macro DIFFx2 6-7
    ; row 0
    movh       %3, %5
    punpcklbw  %3, %4
    psraw      %1, 6
    paddsw     %1, %3
    ; row 1
    movh       %3, %6
    punpcklbw  %3, %4
    psraw      %2, 6
    paddsw     %2, %3
    packuswb   %2, %1
%endmacro
|
||||
|
||||
; STORE_DIFF -- add a scaled (>> 6) residual to the samples in memory and
; write the result back to the same location.
;   (high depth) in: %1, %2, min to clip, max to clip, mem128
;       %1/%2 hold dwords; they are shifted, packed to words, added to [%5],
;       clamped to [%3, %4] and stored back to %5.
;   in: %1, tmp, %3, mem64
;       %1 holds words; the bytes at %4 are widened using %3 as the high bytes,
;       added with signed saturation, packed to unsigned bytes and stored
;       back to %4.
%macro STORE_DIFF 4-5
%if HIGH_BIT_DEPTH
    psrad      %1, 6
    psrad      %2, 6
    packssdw   %1, %2
    paddw      %1, %5
    CLIPW      %1, %3, %4
    mova       %5, %1
%else
    movh       %2, %4
    punpcklbw  %2, %3
    psraw      %1, 6
    paddsw     %1, %2
    packuswb   %1, %1
    movh       %4, %1
%endif
%endmacro
|
||||
|
||||
; SHUFFLE_MASK_W w0, ..., w7 -- emits 16 data bytes from eight word indices.
;   An index below 0x80 becomes the byte pair (2*i, 2*i+1); an index of 0x80
;   or above is emitted unchanged in both bytes.
%macro SHUFFLE_MASK_W 8
%rep 8
    %if %1>=0x80
        db %1, %1
    %else
        db %1*2, %1*2+1
    %endif
    %rotate 1                       ; bring the next index into %1
%endrep
%endmacro
|
||||
|
||||
; ACCUM op, accum, input, iteration
;   instruction, accum, input, iteration (zero to swap, nonzero to add)
;   iteration != 0 : emits `op m<accum>, m<input>`.
;   iteration == 0 : only exchanges the two register names with SWAP, so the
;                    input register becomes the accumulator.
%macro ACCUM 4
%if %4
    %1 m%2, m%3
%else
    SWAP %2, %3
%endif
%endmacro
|
||||
Reference in New Issue
Block a user