x264 source for verification 2026-05-22

This commit is contained in:
2026-05-22 16:45:04 +08:00
commit 4647f166e5
270 changed files with 166522 additions and 0 deletions

136
common/x86/bitstream-a.asm Normal file
View File

@@ -0,0 +1,136 @@
;*****************************************************************************
;* bitstream-a.asm: x86 bitstream functions
;*****************************************************************************
;* Copyright (C) 2010-2025 x264 project
;*
;* Authors: Fiona Glaser <fiona@x264.com>
;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION .text
;-----------------------------------------------------------------------------
; uint8_t *x264_nal_escape( uint8_t *dst, uint8_t *src, uint8_t *end )
;-----------------------------------------------------------------------------
; NAL_LOOP: SIMD inner loop of nal_escape; copies 2*mmsize bytes per iteration.
;   %1 = label of the loop entry point
;   %2 = instruction used to load the next source vectors (mova or movu)
; On entry m2/m1 hold the source vectors for offsets r1 and r1+mmsize, m0 is
; all zero, [r0+r1] addresses dst and [r1+r2] addresses src. r1 is a negative
; offset that is advanced towards zero; the loop falls through once it is >= 0.
%macro NAL_LOOP 2
%%escape:
; Detect false positive to avoid unnecessary escape loop
xor r3d, r3d
cmp byte [r0+r1-1], 0
; r3 = 1 if the dst byte just before this chunk is nonzero
setnz r3b
; result is zero only when bit 0 was the sole hit and the previous byte is nonzero
xor k3, k4
jnz .escape
jmp %%continue
ALIGN 16
%1:
; store the two vectors loaded earlier and build their "byte == 0" masks
mova [r0+r1+mmsize], m1
pcmpeqb m1, m0
mova [r0+r1], m2
pcmpeqb m2, m0
pmovmskb r3d, m1
; preload the vectors for the next iteration (r1 is not advanced yet)
%2 m1, [r1+r2+3*mmsize]
pmovmskb r4d, m2
%2 m2, [r1+r2+2*mmsize]
; k3 = zero mask of all 2*mmsize bytes (upper vector in the high half)
shl k3, mmsize
or k3, k4
; k4 = zero bytes whose preceding byte is also zero; bit 0 of the shifted
; mask is forced to 1, i.e. the byte before the chunk is assumed to be zero
lea k4, [2*r3+1]
and k4, k3
jnz %%escape
%%continue:
add r1, 2*mmsize
jl %1
%endmacro
; NAL_ESCAPE: emits nal_escape for the vector width selected by the preceding INIT_*.
; Arguments (see the prototype comment above): r0 = dst, r1 = src, r2 = end.
; The value left in r0 (dst end, bumped once per inserted escape byte) is returned in rax.
; k3/k4 are r3/r4 at the width needed to hold a byte mask of 2*mmsize bits.
%macro NAL_ESCAPE 0
%if mmsize == 32
%xdefine k3 r3
%xdefine k4 r4
%else
%xdefine k3 r3d
%xdefine k4 r4d
%endif
cglobal nal_escape, 3,5
; the first source byte is copied unconditionally
movzx r3d, byte [r1]
sub r1, r2 ; r1 = offset of current src pointer from end of src
pxor m0, m0
mov [r0], r3b
sub r0, r1 ; r0 = projected end of dst, assuming no more escapes
or r3d, 0xffffff00 ; ignore data before src
; Start off by jumping into the escape loop in case there's an escape at the start.
; And do a few more in scalar until dst is aligned.
jmp .escape_loop
%if mmsize == 16
; aligned-load variant of the SIMD loop, only emitted for mmsize == 16
NAL_LOOP .loop_aligned, mova
jmp .ret
%endif
NAL_LOOP .loop_unaligned, movu
.ret:
movifnidn rax, r0
RET
.escape:
; Skip bytes that are known to be valid
and k4, k3
tzcnt k4, k4
xor r3d, r3d ; the last two bytes are known to be zero
add r1, r4
.escape_loop:
; scalar path: r3d is a shift register holding the most recent bytes written
inc r1
jge .ret
movzx r4d, byte [r1+r2]
shl r3d, 8
or r3d, r4d
test r3d, 0xfffffc ; if the last two bytes are 0 and the current byte is <=3
jz .add_escape_byte
.escaped:
lea r4d, [r0+r1]
mov [r0+r1], r3b
test r4d, mmsize-1 ; Do SIMD when dst is aligned
jnz .escape_loop
; preload the first two source vectors expected by NAL_LOOP
movu m1, [r1+r2+mmsize]
movu m2, [r1+r2]
%if mmsize == 16
; use the aligned-load loop when src is aligned as well
lea r4d, [r1+r2]
test r4d, mmsize-1
jz .loop_aligned
%endif
jmp .loop_unaligned
.add_escape_byte:
; write the escape byte 3, shift dst forward by one and record the 3 in the history
mov byte [r0+r1], 3
inc r0
or r3d, 0x0300
jmp .escaped
%endmacro
; Instantiate nal_escape once per supported vector width.
INIT_MMX mmx2
NAL_ESCAPE
INIT_XMM sse2
NAL_ESCAPE
%if ARCH_X86_64
; The AVX2 version is only assembled for x86-64; with mmsize == 32 the
; k3/k4 masks are the full-width r3/r4 registers.
INIT_YMM avx2
NAL_ESCAPE
%endif

64
common/x86/bitstream.h Normal file
View File

@@ -0,0 +1,64 @@
/*****************************************************************************
* bitstream.h: x86 bitstream functions
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_BITSTREAM_H
#define X264_X86_BITSTREAM_H
/* NAL escaping: one variant per instruction set. Each takes (dst, src, end)
 * and returns a uint8_t pointer. */
#define x264_nal_escape_mmx2 x264_template(nal_escape_mmx2)
uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
#define x264_nal_escape_sse2 x264_template(nal_escape_sse2)
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
#define x264_nal_escape_avx2 x264_template(nal_escape_avx2)
uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
/* CABAC residual coding, rate-distortion ("rd") variants. */
#define x264_cabac_block_residual_rd_internal_sse2 x264_template(cabac_block_residual_rd_internal_sse2)
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_rd_internal_lzcnt x264_template(cabac_block_residual_rd_internal_lzcnt)
void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_rd_internal_ssse3 x264_template(cabac_block_residual_rd_internal_ssse3)
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_rd_internal_ssse3_lzcnt x264_template(cabac_block_residual_rd_internal_ssse3_lzcnt)
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_rd_internal_avx512 x264_template(cabac_block_residual_rd_internal_avx512)
void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
/* CABAC residual coding, 8x8 rate-distortion variants. */
#define x264_cabac_block_residual_8x8_rd_internal_sse2 x264_template(cabac_block_residual_8x8_rd_internal_sse2)
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_8x8_rd_internal_lzcnt x264_template(cabac_block_residual_8x8_rd_internal_lzcnt)
void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_8x8_rd_internal_ssse3 x264_template(cabac_block_residual_8x8_rd_internal_ssse3)
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt x264_template(cabac_block_residual_8x8_rd_internal_ssse3_lzcnt)
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_8x8_rd_internal_avx512 x264_template(cabac_block_residual_8x8_rd_internal_avx512)
void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
/* CABAC residual coding, non-rd variants. */
#define x264_cabac_block_residual_internal_sse2 x264_template(cabac_block_residual_internal_sse2)
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_internal_lzcnt x264_template(cabac_block_residual_internal_lzcnt)
void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_internal_avx2 x264_template(cabac_block_residual_internal_avx2)
void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#define x264_cabac_block_residual_internal_avx512 x264_template(cabac_block_residual_internal_avx512)
void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
#endif

768
common/x86/cabac-a.asm Normal file
View File

@@ -0,0 +1,768 @@
;*****************************************************************************
;* cabac-a.asm: x86 cabac
;*****************************************************************************
;* Copyright (C) 2008-2025 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;* Holger Lubitz <holger@lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 64
%if ARCH_X86_64
; COEFF_LAST_TABLE: emits the table coeff_last_<%1> of 14 dword entries, each the
; offset of a coeff_last function relative to the start of the table.
;   %1    = table name suffix
;   %2    = cpu suffix used for the coeff_last4 entries
;   %3    = cpu suffix used for the coeff_last64 entries
;   %4    = cpu suffix used for the coeff_last15/coeff_last16 entries
;   %5-18 = coefficient count of each of the 14 entries (defaults given below)
; Two zero dwords are appended so that each table occupies 64 bytes.
%macro COEFF_LAST_TABLE 4-18 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%xdefine %%funccpu1 %2 ; last4
%xdefine %%funccpu2 %3 ; last64
%xdefine %%funccpu3 %4 ; last15/last16
coeff_last_%1:
%xdefine %%base coeff_last_%1
%rep 14
; %5 is the count for the current entry; the list is rotated after each one
%ifidn %5, 4
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu1) - %%base
%elifidn %5, 64
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu2) - %%base
%else
dd mangle(private_prefix %+ _coeff_last%5_ %+ %%funccpu3) - %%base
%endif
%rotate 1
%endrep
dd 0, 0 ; 64-byte alignment padding
%endmacro
cextern coeff_last4_mmx2
cextern coeff_last4_lzcnt
%if HIGH_BIT_DEPTH
cextern coeff_last4_avx512
%endif
cextern coeff_last15_sse2
cextern coeff_last15_lzcnt
cextern coeff_last15_avx512
cextern coeff_last16_sse2
cextern coeff_last16_lzcnt
cextern coeff_last16_avx512
cextern coeff_last64_sse2
cextern coeff_last64_lzcnt
cextern coeff_last64_avx2
cextern coeff_last64_avx512
; One dispatch table per cpu level: name, last4 cpu, last64 cpu, last15/16 cpu.
COEFF_LAST_TABLE sse2, mmx2, sse2, sse2
COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, lzcnt
COEFF_LAST_TABLE avx2, lzcnt, avx2, lzcnt
; coeff_last4_avx512 is only declared (cextern) when HIGH_BIT_DEPTH is set,
; so the low bit depth table uses the lzcnt version for last4.
%if HIGH_BIT_DEPTH
COEFF_LAST_TABLE avx512, avx512, avx512, avx512
%else
COEFF_LAST_TABLE avx512, lzcnt, avx512, avx512
%endif
%endif
; 8-entry byte tables indexed by the current node context.
; coeff_abs_level1_ctx / coeff_abs_levelgt1_ctx give the context offset that is
; added to the level context base. coeff_abs_level_transition gives the next
; node context: the first row is read when the coefficient magnitude is <= 1,
; the second row (offset +8) when it is > 1.
coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0
coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9
coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
db 4, 4, 4, 4, 5, 6, 7, 7
SECTION .text
cextern_common cabac_range_lps
cextern_common cabac_transition
cextern_common cabac_renorm_shift
cextern_common cabac_entropy
cextern cabac_size_unary
cextern cabac_transition_unary
cextern_common significant_coeff_flag_offset
cextern_common significant_coeff_flag_offset_8x8
cextern_common last_coeff_flag_offset
cextern_common last_coeff_flag_offset_8x8
cextern_common coeff_abs_level_m1_offset
cextern_common count_cat_m1
cextern cabac_encode_ue_bypass
; 'pointer' reserves one native pointer-sized slot (8 bytes on x86-64, 4 otherwise).
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif
; Field offsets of the CABAC context structure addressed by the code below
; (the 'cb' argument). NOTE(review): presumably this must mirror the C-side
; x264_cabac_t layout -- confirm against its definition.
struc cb
.low: resd 1
.range: resd 1
.queue: resd 1
.bytes_outstanding: resd 1
.start: pointer 1
.p: pointer 1
.end: pointer 1
; pad up to the next 64-byte boundary
align 64, resb 1
.bits_encoded: resd 1
.state: resb 1024
endstruc
; LOAD_GLOBAL: zero-extending byte load from the global table %2 at index %3 (+ %4).
;   %1 = destination register
;   %2 = table symbol (may include a constant displacement)
;   %3 = first index register
;   %4 = optional second index term (default 0)
;   %5 = scratch register, only used on x86-64 when %4 is non-zero
; On x86-64 the address is formed relative to r7, which the caller must have
; loaded with the address of $$ (lea r7, [$$]).
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
%if ARCH_X86_64 == 0
movzx %1, byte [%2+%3+%4]
%elifidn %4, 0
movzx %1, byte [%2+%3+r7-$$]
%else
; fold the base register and the second index into the scratch register first
lea %5, [r7+%4]
movzx %1, byte [%2+%3+%5-$$]
%endif
%endmacro
; CABAC: emits the CABAC bit-encoding primitives with name suffix %1 (asm or bmi2):
;   cabac_encode_decision_%1 -- arguments: cb, context index, bit
;   cabac_encode_bypass_%1   -- arguments: cb, value that is ANDed with cb.range
;   cabac_encode_terminal_%1 -- argument: cb (not emitted for the bmi2 suffix)
;   cabac_putbyte_%1         -- shared tail reached by jumps from the functions above
%macro CABAC 1
; t3 must be ecx, since it's used for shift.
%if WIN64
DECLARE_REG_TMP 3,1,2,0,5,6,4,4
%elif ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
%else
DECLARE_REG_TMP 0,4,2,1,3,5,6,2
%endif
cglobal cabac_encode_decision_%1, 1,7
movifnidn t1d, r1m
mov t5d, [r0+cb.range]
; t6 = cb.state[ctx]
movzx t6d, byte [r0+cb.state+t1]
movifnidn t0, r0 ; WIN64
mov t4d, ~1
mov t3d, t5d
; t4 = state with its low bit cleared, t5 = range >> 6
and t4d, t6d
shr t5d, 6
movifnidn t2d, r2m
%if WIN64
PUSH r7
%endif
%if ARCH_X86_64
; r7 = base address used by LOAD_GLOBAL
lea r7, [$$]
%endif
; t5 = byte at cabac_range_lps - 4 + (range>>6) + 2*(state & ~1)
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
; t4 = byte at cabac_transition + bit + 2*state (the new state)
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
and t6d, 1
; t3 = range - range_lps
sub t3d, t5d
cmp t6d, t2d
mov t6d, [t0+cb.low]
lea t2, [t6+t3]
; if the low bit of the state differs from the bit being coded:
; range = range_lps and low += range - range_lps
cmovne t3d, t5d
cmovne t6d, t2d
mov [t0+cb.state+t1], t4b
;cabac_encode_renorm
mov t4d, t3d
%ifidn %1, bmi2
; shift = lzcnt(range) - 23
lzcnt t3d, t3d
sub t3d, 23
shlx t4d, t4d, t3d
shlx t6d, t6d, t3d
%else
; shift = cabac_renorm_shift[range >> 3]
shr t3d, 3
LOAD_GLOBAL t3d, cabac_renorm_shift, t3
shl t4d, t3b
shl t6d, t3b
%endif
%if WIN64
POP r7
%endif
mov [t0+cb.range], t4d
; queue += shift; flush through putbyte once it becomes non-negative
add t3d, [t0+cb.queue]
jge cabac_putbyte_%1
.update_queue_low:
mov [t0+cb.low], t6d
mov [t0+cb.queue], t3d
RET
cglobal cabac_encode_bypass_%1, 2,3
mov t7d, [r0+cb.low]
; low = 2*low + (argument & range)
and r1d, [r0+cb.range]
lea t7d, [t7*2+r1]
movifnidn t0, r0 ; WIN64
mov t3d, [r0+cb.queue]
inc t3d
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
jge cabac_putbyte_%1
%else
jge .putbyte
%endif
mov [t0+cb.low], t7d
mov [t0+cb.queue], t3d
RET
%if ARCH_X86_64 == 0
.putbyte:
PROLOGUE 0,7
; cabac_putbyte expects low in t6
movifnidn t6d, t7d
jmp cabac_putbyte_%1
%endif
%ifnidn %1,bmi2
cglobal cabac_encode_terminal_%1, 1,3
sub dword [r0+cb.range], 2
; shortcut: the renormalization shift in terminal
; can only be 0 or 1 and is zero over 99% of the time.
test dword [r0+cb.range], 0x100
je .renorm
RET
.renorm:
; shift by one: low <<= 1, range <<= 1, queue++
shl dword [r0+cb.low], 1
shl dword [r0+cb.range], 1
inc dword [r0+cb.queue]
jge .putbyte
RET
.putbyte:
; load the registers cabac_putbyte expects and fall through into it
PROLOGUE 0,7
movifnidn t0, r0 ; WIN64
mov t3d, [r0+cb.queue]
mov t6d, [t0+cb.low]
%endif
cabac_putbyte_%1:
; alive: t0=cb t3=queue t6=low
%if WIN64
DECLARE_REG_TMP 3,6,1,0,2,5,4
%endif
; t2 = low >> (queue+10); low keeps only its lowest queue+10 bits; queue -= 8
%ifidn %1, bmi2
add t3d, 10
shrx t2d, t6d, t3d
bzhi t6d, t6d, t3d
sub t3d, 18
%else
mov t1d, -1
add t3d, 10
mov t2d, t6d
shl t1d, t3b
shr t2d, t3b ; out
not t1d
sub t3d, 18
and t6d, t1d
%endif
mov t5d, [t0+cb.bytes_outstanding]
; an output byte of 0xff is not written yet, only counted as outstanding
cmp t2b, 0xff ; FIXME is a 32bit op faster?
jz .postpone
mov t1, [t0+cb.p]
; t2h holds bits 8-15 of out (the carry): add it to the last written byte
add [t1-1], t2h
dec t2h
.loop_outstanding:
; emit one byte of (carry - 1) per iteration; runs bytes_outstanding+1 times
mov [t1], t2h
inc t1
dec t5d
jge .loop_outstanding
; the final slot written by the loop is replaced with the low byte of out
mov [t1-1], t2b
mov [t0+cb.p], t1
.postpone:
; t5 is -1 after the loop above, so this stores 0 on that path
inc t5d
mov [t0+cb.bytes_outstanding], t5d
jmp mangle(private_prefix %+ _cabac_encode_decision_%1.update_queue_low)
%endmacro
; Emit the baseline ("asm") and BMI2 variants of the CABAC primitives.
CABAC asm
CABAC bmi2
%if ARCH_X86_64
; COEFF_ABS_LEVEL_GT1: adds the bit cost of one coefficient level to r0d and
; updates the affected context states.
; %1 = label name
; %2 = node_ctx init?
; When %2 is set the node context is taken to be 0, so the table lookups are
; replaced by the constants found at index 0 of the tables (ctx 1 and 5,
; resulting node context 1 or 4).
; Register use: r0d = bit cost accumulator, r1d = coefficient magnitude,
; r2 = node context (read when %2 == 0, always written), r8 = base pointer of
; the level context states; r9-r11 are used as scratch.
%macro COEFF_ABS_LEVEL_GT1 2
%if %2
%define ctx 1
%else
movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
%define ctx r11
%endif
movzx r9d, byte [r8+ctx]
; if( coeff_abs > 1 )
cmp r1d, 1
jg .%1_gt1
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
; the additional 256 is added together with the entropy cost
lea r0d, [r0+r9+256]
mov [r8+ctx], r10b
%if %2
mov r2d, 1
%else
movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
%endif
jmp .%1_end
.%1_gt1:
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
xor r9d, 1
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r8+ctx], r10b
add r0d, r9d
%if %2
%define ctx 5
%else
movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
%define ctx r11
%endif
; if( coeff_abs < 15 )
cmp r1d, 15
jge .%1_escape
; r1 = coeff_abs * 128, the row stride of the unary tables (hence the -128/-256 below)
shl r1d, 7
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
movzx r9d, byte [r8+ctx]
add r9d, r1d
movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
mov [r8+ctx], r10b
add r0d, r9d
jmp .%1_gt1_end
.%1_escape:
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
movzx r9d, byte [r8+ctx]
movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
add r0d, r9d
mov [r8+ctx], r10b
sub r1d, 14
; r9 = index of the highest set bit of coeff_abs-14
%if cpuflag(lzcnt)
lzcnt r9d, r1d
xor r9d, 0x1f
%else
bsr r9d, r1d
%endif
; bs_size_ue_big(coeff_abs-15)<<8
shl r9d, 9
; (ilog2(coeff_abs-14)+1) << 8
lea r0d, [r0+r9+256]
.%1_gt1_end:
%if %2
mov r2d, 4
%else
movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
%endif
.%1_end:
%endmacro
; LOAD_DCTCOEF: loads coefficient number r6 from the array at 'dct' into %1.
; Coefficients are dwords when HIGH_BIT_DEPTH is set, otherwise words that are
; zero-extended (the callers in this file load from the abs() copy on the stack).
%macro LOAD_DCTCOEF 1
%if HIGH_BIT_DEPTH
mov %1, [dct+r6*4]
%else
movzx %1, word [dct+r6*2]
%endif
%endmacro
; ABS_DCTCOEFS: stores the absolute values of %2 coefficients read from [%1]
; to the stack starting at [rsp].
;   %1 = source pointer
;   %2 = number of coefficients
; The number of vector operations depends on how many registers of the
; current mmsize are needed to cover %2*SIZEOF_DCTCOEF bytes.
%macro ABS_DCTCOEFS 2
%if HIGH_BIT_DEPTH
%define %%abs ABSD
%else
%define %%abs ABSW
%endif
%if mmsize == %2*SIZEOF_DCTCOEF
; everything fits in a single register
%%abs m0, [%1], m1
mova [rsp], m0
%elif mmsize == %2*SIZEOF_DCTCOEF/2
; two registers
%%abs m0, [%1+0*mmsize], m2
%%abs m1, [%1+1*mmsize], m3
mova [rsp+0*mmsize], m0
mova [rsp+1*mmsize], m1
%else
; groups of four registers
%assign i 0
%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
%%abs m0, [%1+(4*i+0)*mmsize], m4
%%abs m1, [%1+(4*i+1)*mmsize], m5
%%abs m2, [%1+(4*i+2)*mmsize], m4
%%abs m3, [%1+(4*i+3)*mmsize], m5
mova [rsp+(4*i+0)*mmsize], m0
mova [rsp+(4*i+1)*mmsize], m1
mova [rsp+(4*i+2)*mmsize], m2
mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
%endif
%endmacro
; SIG_OFFSET: in 8x8 mode (%1 != 0) loads into r11 the byte at index r6 of the
; table pointed to by r4; expands to nothing otherwise.
%macro SIG_OFFSET 1
%if %1
movzx r11d, byte [r4+r6]
%endif
%endmacro
; LAST_OFFSET: in 8x8 mode (%1 != 0) loads into r11 the byte at index r6 of
; last_coeff_flag_offset_8x8; expands to nothing otherwise.
%macro LAST_OFFSET 1
%if %1
movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
%endif
%endmacro
; COEFF_LAST: calls entry number %2 of the offset table %1.
; Each entry is a signed 32-bit offset relative to the table start (as
; emitted by COEFF_LAST_TABLE). r1 and r6 are overwritten before the call.
%macro COEFF_LAST 2 ; table, ctx_block_cat
lea r1, [%1 GLOBAL]
movsxd r6, [r1+4*%2]
add r6, r1
call r6
%endmacro
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
; intptr_t ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------
; Bit-cost-only residual coding: no bitstream is written; the cost is
; accumulated in r0d and stored to cb.bits_encoded, and the context states in
; cb.state are updated.
;%1 = 8x8 mode
;%2 = coeff_last offset table used to find the last coefficient
%macro CABAC_RESIDUAL_RD 2
%if %1
%define func cabac_block_residual_8x8_rd_internal
%define maxcoeffs 64
%define dct rsp
%else
%define func cabac_block_residual_rd_internal
%define maxcoeffs 16
%define dct r4
%endif
cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
; r12 = base register for GLOBAL addressing
lea r12, [$$]
%define GLOBAL +r12-$$
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r4 = sig offset 8x8
%endif
add r1d, r2d
movzx r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig
movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last
movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level
; abs() all the coefficients; copy them to the stack to avoid
; changing the originals.
; overreading is okay; it's all valid aligned data anyways.
%if %1
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
ABS_DCTCOEFS r4, 16
xor r4, r0 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif
; for improved OOE performance, run coeff_last on the original coefficients.
COEFF_LAST %2, r2 ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded
; pre-add some values to simplify addressing
add r3, cb.state
add r5, r3
add r7, r3
add r8, r3 ; precalculate cabac state pointers
; if( last != count_cat_m1[ctx_block_cat] )
%if %1
cmp r6b, 63
%else
cmp r6b, [count_cat_m1+r2 GLOBAL]
%endif
je .skip_last_sigmap
; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
; so we'll use r11 for this.
%if %1
%define siglast_ctx r11
%else
%define siglast_ctx r6
%endif
; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
SIG_OFFSET %1
movzx r1d, byte [r5+siglast_ctx]
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
xor r1d, 1
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
mov [r5+siglast_ctx], r9b
add r0d, r1d
LAST_OFFSET %1
movzx r1d, byte [r7+siglast_ctx]
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
xor r1d, 1
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
mov [r7+siglast_ctx], r9b
add r0d, r1d
.skip_last_sigmap:
; level cost of the last coefficient, starting from node context 0
LOAD_DCTCOEF r1d
COEFF_ABS_LEVEL_GT1 last, 1
; for( int i = last-1 ; i >= 0; i-- )
dec r6d
jl .end
.coeff_loop:
LOAD_DCTCOEF r1d
; if( l[i] )
SIG_OFFSET %1
movzx r9d, byte [r5+siglast_ctx]
test r1d, r1d
jnz .coeff_nonzero
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r5+siglast_ctx], r10b
add r0d, r9d
dec r6d
jge .coeff_loop
jmp .end
.coeff_nonzero:
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
xor r9d, 1
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r5+siglast_ctx], r10b
add r0d, r9d
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
LAST_OFFSET %1
movzx r9d, byte [r7+siglast_ctx]
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
mov [r7+siglast_ctx], r10b
add r0d, r9d
COEFF_ABS_LEVEL_GT1 coeff, 0
dec r6d
jge .coeff_loop
.end:
; r3 was advanced by cb.state above, hence the compensated offset
mov [r3+cb.bits_encoded-cb.state], r0d
RET
%endmacro
; Instantiate the 4x4 (first argument 0) and 8x8 (first argument 1) rd
; functions per cpu level. The ssse3 builds reuse the sse2/lzcnt coeff_last tables.
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
; For avx512 the non-8x8 version uses zmm registers with HIGH_BIT_DEPTH and
; ymm registers otherwise; the 8x8 version always uses zmm.
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx512
%endif
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
;-----------------------------------------------------------------------------
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
; intptr_t ctx_block_cat, x264_cabac_t *cb );
;-----------------------------------------------------------------------------
; CALL_CABAC: calls the cabac_encode_decision variant matching the current
; cpuflags. The caller sets r0 = cb, r1d = context index and r2d = bit before
; invoking it. On WIN64, r0 is reloaded from r3 afterwards.
%macro CALL_CABAC 0
%if cpuflag(bmi2)
call cabac_encode_decision_bmi2
%else
call cabac_encode_decision_asm
%endif
%if WIN64 ; move cabac back
mov r0, r3
%endif
%endmacro
; SIGMAP_LOOP: encodes the significance map and collects the nonzero
; coefficients into the 'coeffs' array (indexed by coeffidx) for the level loop.
; r10 is the coefficient index i; lastm holds the index of the last nonzero
; coefficient.
; %1 = 8x8 mode
; %2 = dct register
; %3 = countcat
; %4 = name
%macro SIGMAP_LOOP 3-4
.sigmap_%4loop:
%if HIGH_BIT_DEPTH
mov %2, [dct+r10*4]
%else
movsx %2, word [dct+r10*2]
%endif
; r1d = context index for the significance flag of coefficient i
%if %1
movzx r1d, byte [sigoff_8x8 + r10]
add r1d, sigoffd
%else
lea r1d, [sigoffd + r10d]
%endif
test %2, %2
jz .sigmap_%4zero ; if( l[i] )
inc coeffidxd
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i];
mov r2d, 1
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
; r1d = context index for the last flag of coefficient i
%if %1
movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
add r1d, lastoffd
%else
lea r1d, [lastoffd + r10d]
%endif
cmp r10d, lastm ; if( i == last )
je .sigmap_%4last
xor r2d, r2d
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
jmp .sigmap_%4loop_endcheck
.sigmap_%4zero:
xor r2d, r2d
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
.sigmap_%4loop_endcheck:
inc r10d
cmp r10d, %3
jne .sigmap_%4loop ; if( ++i == count_m1 )
; reached index %3 without hitting 'last': that coefficient is stored without
; coding any flags for it
%if HIGH_BIT_DEPTH
mov %2, [dct+r10*4]
%else
movsx %2, word [dct+r10*2]
%endif
inc coeffidxd
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]
jmp .sigmap_%4end
.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
mov r2d, 1
CALL_CABAC
.sigmap_%4end:
; the non-8x8 expansion has to jump over the 8x8 expansion that follows it
%if %1==0
jmp .level_loop_start
%endif
%endmacro
; CABAC_RESIDUAL: emits cabac_block_residual_internal for the current cpuflags.
;   %1 = coeff_last offset table passed to COEFF_LAST
; Arguments on entry: r0 = coefficients (l), r1 = b_interlaced,
; r2 = ctx_block_cat, r3 = cb.
; Stack layout (dwords): [rsp+0] = leveloff, [rsp+4] = last,
; [rsp+8...] = coeffs[], filled by SIGMAP_LOOP and consumed back to front by
; the level loop. Up to 64 coefficients can be collected (63 loop iterations
; plus the unconditional final store in 8x8 mode), so 64+2 dwords are reserved;
; reserving only 64 dwords left the last entries in allocation padding.
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15,0,-4*(64+2)
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
    lea                  r7, [$$]
    %define lastm [rsp+4*1]
    %define GLOBAL +r7-$$
    shl                 r1d, 4

    %define sigoffq r8
    %define sigoffd r8d
    %define lastoffq r9
    %define lastoffd r9d
    %define leveloffq r10
    %define leveloffd r10d
    %define leveloffm [rsp+4*0]
    %define countcatd r11d
    %define sigoff_8x8 r12
    %define coeffidxq r13
    %define coeffidxd r13d
    %define dct r14
    %define coeffs rsp+4*2

    ; look up the context offsets for this block category / interlace mode
    lea          sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
    add                 r1d, r2d
    movzx           sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
    movzx          lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
    movzx         leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
    movzx         countcatd, byte [count_cat_m1+r2 GLOBAL]
    mov           coeffidxd, -1
    mov                 dct, r0
    ; r10 (leveloff) is reused as the loop index below, so spill it
    mov           leveloffm, leveloffd

    COEFF_LAST %1, r2
    mov               lastm, eax
; put cabac in r0; needed for cabac_encode_decision
    mov                  r0, r3

    xor                r10d, r10d
    cmp           countcatd, 63
    je .sigmap_8x8
    SIGMAP_LOOP 0, r12d, countcatd
.sigmap_8x8:
    SIGMAP_LOOP 1, r11d, 63, _8x8
.level_loop_start:
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
    %define nodectxq r8
    %define nodectxd r8d
    mov           leveloffd, leveloffm
    xor            nodectxd, nodectxd
.level_loop:
    ; r9d = abs(coeff), r11d = sign mask (0 or -1)
    mov                 r9d, [coeffs+coeffidxq*4]
    mov                r11d, r9d
    sar                r11d, 31
    add                 r9d, r11d
    movzx               r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
    xor                 r9d, r11d
    add                 r1d, leveloffd
    cmp                 r9d, 1
    jg .level_gt1
    xor                 r2d, r2d
    CALL_CABAC
    movzx          nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
    jmp .level_sign
.level_gt1:
    mov                 r2d, 1
    CALL_CABAC
    movzx              r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
    add                r14d, leveloffd
    ; r12d = min(abs, 15) - 2 = number of additional 1 bits to code
    cmp                 r9d, 15
    mov                r12d, 15
    cmovl              r12d, r9d
    sub                r12d, 2
    jz .level_eq2
.level_gt1_loop:
    mov                 r1d, r14d
    mov                 r2d, 1
    CALL_CABAC
    dec                r12d
    jg .level_gt1_loop
    cmp                 r9d, 15
    jge .level_bypass
.level_eq2:
    ; terminating 0 bit of the unary prefix
    mov                 r1d, r14d
    xor                 r2d, r2d
    CALL_CABAC
    jmp .level_gt1_end
.level_bypass:
    ; cabac_encode_ue_bypass( cb, 0, abs - 15 )
    lea                 r2d, [r9d-15]
    xor                 r1d, r1d
    push                 r0
; we could avoid this if we implemented it in asm, but I don't feel like that
; right now.
%if UNIX64
    push                 r7
    push                 r8
    ; Three 8-byte pushes would leave rsp 8 bytes off a 16-byte boundary at
    ; the call into C code. Pad to 32 bytes, mirroring the 8+40 byte
    ; adjustment of the other branch (both assume rsp is 16-byte aligned
    ; after the prologue).
    sub                 rsp, 8
%else
    sub                 rsp, 40 ; shadow space and alignment
%endif
    call cabac_encode_ue_bypass
%if UNIX64
    add                 rsp, 8
    pop                  r8
    pop                  r7
%else
    add                 rsp, 40
%endif
    pop                  r0
.level_gt1_end:
    movzx          nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
.level_sign:
    ; code the sign: the bypass encoder ANDs this mask with cb.range
    mov                 r1d, r11d
%if cpuflag(bmi2)
    call cabac_encode_bypass_bmi2
%else
    call cabac_encode_bypass_asm
%endif
%if WIN64
    mov                  r0, r3
%endif
    dec           coeffidxd
    jge .level_loop
    RET
%endmacro
; Instantiate cabac_block_residual_internal per cpu level; only the coeff_last
; table and the cpuflags differ (the macro body uses no vector registers).
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM lzcnt
CABAC_RESIDUAL coeff_last_lzcnt
INIT_XMM avx2
CABAC_RESIDUAL coeff_last_avx2
INIT_XMM avx512
CABAC_RESIDUAL coeff_last_avx512
%endif

82
common/x86/const-a.asm Normal file
View File

@@ -0,0 +1,82 @@
;*****************************************************************************
;* const-a.asm: x86 global constants
;*****************************************************************************
;* Copyright (C) 2010-2025 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
; Shared read-only vector constants. Names starting with pb_/pw_/pd_ are
; emitted as bytes (db), words (dw) and dwords (dd) respectively.
SECTION_RODATA 32
; 32-byte constants
const pb_1, times 32 db 1
const hsub_mul, times 16 db 1, -1
const pw_1, times 16 dw 1
const pw_16, times 16 dw 16
const pw_32, times 16 dw 32
const pw_512, times 16 dw 512
const pw_00ff, times 16 dw 0x00ff
const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
const pd_1, times 8 dd 1
; pd_0123 and pd_4567 are adjacent, together forming the sequence 0..7
const pd_0123, dd 0,1,2,3
const pd_4567, dd 4,5,6,7
const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
const pb_unpackbd2, times 2 db 4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7
; 16-byte constants
const pb_01, times 8 db 0,1
const pb_0, times 16 db 0
const pb_a1, times 16 db 0xa1
const pb_3, times 16 db 3
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
const pw_2, times 8 dw 2
const pw_m2, times 8 dw -2
const pw_4, times 8 dw 4
const pw_8, times 8 dw 8
const pw_64, times 8 dw 64
const pw_256, times 8 dw 256
; four words of 32 followed by four words of 0
const pw_32_0, times 4 dw 32
times 4 dw 0
const pw_8000, times 8 dw 0x8000
const pw_3fff, times 8 dw 0x3fff
; sign patterns: p = +1, m = -1, z = 0
const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
const pd_8, times 4 dd 8
const pd_32, times 4 dd 32
const pd_1024, times 4 dd 1024
const pd_ffff, times 4 dd 0xffff
const pw_ff00, times 8 dw 0xff00
const popcnt_table
; popcnt_table[x] = number of set bits in the byte value x, for x = 0..255
%assign x 0
%rep 256
    ; count the bits of x one at a time
    %assign popcnt_bits 0
    %assign popcnt_shift 0
    %rep 8
        %assign popcnt_bits popcnt_bits + ((x >> popcnt_shift) & 1)
        %assign popcnt_shift popcnt_shift + 1
    %endrep
    db popcnt_bits
    %assign x x+1
%endrep
; a single dword holding the value 64
const sw_64, dd 64

107
common/x86/cpu-a.asm Normal file
View File

@@ -0,0 +1,107 @@
;*****************************************************************************
;* cpu-a.asm: x86 cpu utilities
;*****************************************************************************
;* Copyright (C) 2003-2025 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Fiona Glaser <fiona@x264.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
SECTION .text
;-----------------------------------------------------------------------------
; void cpu_cpuid( int op, int *eax, int *ebx, int *ecx, int *edx )
;-----------------------------------------------------------------------------
; Executes CPUID with eax = op and ecx = 0 and stores the four result
; registers through the given pointers.
cglobal cpu_cpuid, 5,7
; cpuid overwrites eax, ebx, ecx and edx, so rbx is preserved and the four
; output pointers are parked on the stack until after the instruction
push rbx
push r4
push r3
push r2
push r1
mov eax, r0d
xor ecx, ecx
cpuid
; pop the pointers in the order r1..r4 (eax, ebx, ecx, edx destinations),
; using r4 as the scratch register each time
pop r4
mov [r4], eax
pop r4
mov [r4], ebx
pop r4
mov [r4], ecx
pop r4
mov [r4], edx
pop rbx
RET
;-----------------------------------------------------------------------------
; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
; Executes XGETBV with ecx = xcr. The instruction returns its result in
; edx:eax; on x86-64 the two halves are merged into rax.
cglobal cpu_xgetbv
movifnidn ecx, r0m
xgetbv
%if ARCH_X86_64
shl rdx, 32
or rax, rdx
%endif
ret
;-----------------------------------------------------------------------------
; void cpu_emms( void )
;-----------------------------------------------------------------------------
; Executes the EMMS instruction and returns.
cglobal cpu_emms
emms
ret
;-----------------------------------------------------------------------------
; void cpu_sfence( void )
;-----------------------------------------------------------------------------
; Executes the SFENCE instruction and returns.
cglobal cpu_sfence
sfence
ret
%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; int cpu_cpuid_test( void )
; return 0 if unsupported
;-----------------------------------------------------------------------------
; Tries to toggle bit 21 (0x200000) of EFLAGS and returns the bits that
; actually changed. Only assembled for 32-bit builds.
cglobal cpu_cpuid_test
; save the original flags and the registers pushed below
pushfd
push ebx
push ebp
push esi
push edi
; eax = ebx = current EFLAGS
pushfd
pop eax
mov ebx, eax
; write the flags back with bit 21 flipped
xor eax, 0x200000
push eax
popfd
; read the flags again; eax = difference from the original value
pushfd
pop eax
xor eax, ebx
pop edi
pop esi
pop ebp
pop ebx
; restore the caller's original EFLAGS
popfd
ret
%endif

590
common/x86/dct-32.asm Normal file
View File

@@ -0,0 +1,590 @@
;*****************************************************************************
;* dct-32.asm: x86_32 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2025 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Min Chen <chenm001.163.com>
;* Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION .text
cextern pd_32
cextern pw_pixel_max
cextern pw_2
cextern pw_m2
cextern pw_32
cextern hsub_mul
; Stores a list of vector registers into 16-byte memory slots.
; %1 is the base address expression; it is followed by N register numbers and
; then N slot indices. Register i is written to [%1 + slot_i*16].
%macro SPILL_SHUFFLE 3-* ; ptr, list of regs, list of memory offsets
; Capture the base now: the rotations below change what %1 refers to.
%xdefine %%base %1
; %0 = 1 + 2N, so the integer division yields N iterations.
%rep %0/2
%xdefine %%tmp m%2
; Rotate forward by N: %2 becomes the slot index paired with this register.
%rotate %0/2
mova [%%base + %2*16], %%tmp
; Rotate back by N-1: net shift of one, so %2 is the next register.
%rotate 1-%0/2
%endrep
%endmacro
; Loads a list of vector registers from 16-byte memory slots.
; %1 is the base address expression; it is followed by N register numbers and
; then N slot indices. Register i is read from [%1 + slot_i*16].
%macro UNSPILL_SHUFFLE 3-*
; Capture the base now: the rotations below change what %1 refers to.
%xdefine %%base %1
; %0 = 1 + 2N, so the integer division yields N iterations.
%rep %0/2
%xdefine %%tmp m%2
; Rotate forward by N: %2 becomes the slot index paired with this register.
%rotate %0/2
mova %%tmp, [%%base + %2*16]
; Rotate back by N-1: net shift of one, so %2 is the next register.
%rotate 1-%0/2
%endrep
%endmacro
; Stores each listed register N to the slot with the same number,
; i.e. [%1 + N*16].
; The "2+" parameter count makes %2 absorb the rest of the line, commas
; included, so the same list is handed to SPILL_SHUFFLE twice: once as the
; registers and once as the slot indices.
%macro SPILL 2+ ; assume offsets are the same as reg numbers
SPILL_SHUFFLE %1, %2, %2
%endmacro
; Loads each listed register N from the slot with the same number,
; i.e. [%1 + N*16].
; The "2+" parameter count makes %2 absorb the rest of the line, commas
; included, so the same list is handed to UNSPILL_SHUFFLE twice: once as the
; registers and once as the slot indices.
%macro UNSPILL 2+
UNSPILL_SHUFFLE %1, %2, %2
%endmacro
; in: size, m0..m7
; out: 0,4,6 in memory at %10,%11,%12, rest in regs
; One 1-D pass of the 8-point forward transform, for builds that only have
; eight vector registers.
;   %1       element-size suffix glued onto psra/padd/psub (used in this file
;            with w and d)
;   %2..%9   register numbers holding the eight inputs
;   %10..%12 memory operands that receive outputs 0, 4 and 6
; The other five outputs stay in registers; the closing SWAP renames the
; registers so that they appear under the expected numbers.
; NOTE(review): SUMSUB_BA and SWAP are defined outside this file (presumably
; x86util.asm / x86inc.asm); the sNN = sum, dNN = difference labels below
; follow the existing comments.
%macro DCT8_1D 12
SUMSUB_BA %1, %9, %2 ; %9 = s07, %2 = d07
SUMSUB_BA %1, %8, %3 ; %8 = s16, %3 = d16
SUMSUB_BA %1, %7, %4 ; %7 = s25, %4 = d25
SUMSUB_BA %1, %6, %5 ; %6 = s34, %5 = d34
SUMSUB_BA %1, %6, %9 ; %6 = a0, %9 = a2
SUMSUB_BA %1, %7, %8 ; %7 = a1, %8 = a3
SUMSUB_BA %1, %7, %6 ; %7 = dst0, %6 = dst4
; Outputs 0 and 4 are complete: move them to memory to free two registers.
mova %10, m%7
mova %11, m%6
psra%1 m%7, m%8, 1 ; a3>>1
padd%1 m%7, m%9 ; a2 + (a3>>1)
psra%1 m%9, 1 ; a2>>1
psub%1 m%9, m%8 ; (a2>>1) - a3
; Output 6 is complete as well.
mova %12, m%9
; Odd half: built from the four differences d07, d16, d25, d34.
psra%1 m%6, m%4, 1
padd%1 m%6, m%4 ; d25+(d25>>1)
psub%1 m%8, m%2, m%5 ; a5 = d07-d34-(d25+(d25>>1))
psub%1 m%8, m%6
psra%1 m%6, m%3, 1
padd%1 m%6, m%3 ; d16+(d16>>1)
padd%1 m%9, m%2, m%5
psub%1 m%9, m%6 ; a6 = d07+d34-(d16+(d16>>1))
psra%1 m%6, m%2, 1
padd%1 m%6, m%2 ; d07+(d07>>1)
padd%1 m%6, m%3
padd%1 m%6, m%4 ; a4 = d16+d25+(d07+(d07>>1))
psra%1 m%2, m%5, 1
padd%1 m%2, m%5 ; d34+(d34>>1)
padd%1 m%2, m%3
psub%1 m%2, m%4 ; a7 = d16-d25+(d34+(d34>>1))
psra%1 m%5, m%2, 2
padd%1 m%5, m%6 ; a4 + (a7>>2)
psra%1 m%4, m%9, 2
padd%1 m%4, m%8 ; a5 + (a6>>2)
psra%1 m%6, 2
psra%1 m%8, 2
psub%1 m%6, m%2 ; (a4>>2) - a7
psub%1 m%9, m%8 ; a6 - (a5>>2)
SWAP %3, %5, %4, %7, %9, %6
%endmacro
; in: size, m[1,2,3,5,6,7], 0,4 in mem at %10,%11
; out: m0..m7
; One 1-D pass of the 8-point inverse transform, for builds that only have
; eight vector registers.
;   %1       element-size suffix glued onto psra/padd/psub (used in this file
;            with w and d)
;   %2..%9   register numbers; the registers named by %2 and %6 carry no input
;            (both are written before they are read), the other six hold
;            inputs 1,2,3,5,6,7
;   %10,%11  memory operands holding inputs 0 and 4
; All eight outputs end up in registers; the closing SWAPs rename them into
; order.
; NOTE(review): SUMSUB_BA and SWAP are defined outside this file (presumably
; x86util.asm / x86inc.asm).
%macro IDCT8_1D 11
psra%1 m%2, m%4, 1
psra%1 m%6, m%8, 1
psub%1 m%2, m%8
padd%1 m%6, m%4
psra%1 m%8, m%3, 1
padd%1 m%8, m%3
padd%1 m%8, m%5
padd%1 m%8, m%7
psra%1 m%4, m%7, 1
padd%1 m%4, m%7
padd%1 m%4, m%9
psub%1 m%4, m%3
psub%1 m%3, m%5
psub%1 m%7, m%5
padd%1 m%3, m%9
psub%1 m%7, m%9
psra%1 m%5, 1
psra%1 m%9, 1
psub%1 m%3, m%5
psub%1 m%7, m%9
psra%1 m%5, m%8, 2
psra%1 m%9, m%4, 2
padd%1 m%5, m%7
padd%1 m%9, m%3
psra%1 m%7, 2
psra%1 m%3, 2
psub%1 m%8, m%7
psub%1 m%3, m%4
; Two registers are free again: fetch the inputs that were left in memory.
mova m%4, %10
mova m%7, %11
; Final butterflies combining the even and odd halves.
SUMSUB_BA %1, %7, %4
SUMSUB_BA %1, %6, %7
SUMSUB_BA %1, %2, %4
SUMSUB_BA %1, %8, %6
SUMSUB_BA %1, %3, %2
SUMSUB_BA %1, %9, %4
SUMSUB_BA %1, %5, %7
SWAP %2, %4
SWAP %6, %8
SWAP %2, %6, %7
SWAP %4, %9, %8
%endmacro
%if HIGH_BIT_DEPTH
; High-bit-depth sub8x8_dct8 for x86_32, instantiated below for sse2, sse4
; and avx.
; r0 = dct output, r1 = pix1, r2 = pix2.
; Only eight vector registers are requested, so the output buffer at r0 doubles
; as scratch space between the two passes.
; NOTE(review): LOAD_DIFF8x4, TRANSPOSE4x4W and WIDEN_SXWD are defined outside
; this file; the phase descriptions below go by their names and by how their
; results are used here - confirm against x86util.asm.
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,8
cglobal_label .skip_prologue
; Fetch the eight rows of differences into m0..m7.
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
; First pass on 16-bit words; outputs 0, 4 and 6 are written to r0+0x00,
; r0+0x10 and r0+0x50.
DCT8_1D w, 0,1,2,3,4,5,6,7, [r0],[r0+0x10],[r0+0x50]
mova m0, [r0]
; Park two more first-pass results so that m4..m7 can later be reloaded from
; 0x10/0x30/0x50/0x70.
mova [r0+0x30], m5
mova [r0+0x70], m7
; First half: transpose, widen each register into a pair of dword registers,
; then run the second pass on dwords.
TRANSPOSE4x4W 0,1,2,3,4
WIDEN_SXWD 0,4
WIDEN_SXWD 1,5
WIDEN_SXWD 2,6
WIDEN_SXWD 3,7
DCT8_1D d, 0,4,1,5,2,6,3,7, [r0],[r0+0x80],[r0+0xC0]
mova [r0+0x20], m4
mova [r0+0x40], m1
mova [r0+0x60], m5
mova [r0+0xA0], m6
mova [r0+0xE0], m7
; Second half: reload the four parked first-pass results and repeat.
mova m4, [r0+0x10]
mova m5, [r0+0x30]
mova m6, [r0+0x50]
mova m7, [r0+0x70]
TRANSPOSE4x4W 4,5,6,7,0
WIDEN_SXWD 4,0
WIDEN_SXWD 5,1
WIDEN_SXWD 6,2
WIDEN_SXWD 7,3
DCT8_1D d,4,0,5,1,6,2,7,3, [r0+0x10],[r0+0x90],[r0+0xD0]
mova [r0+0x30], m0
mova [r0+0x50], m5
mova [r0+0x70], m1
mova [r0+0xB0], m2
mova [r0+0xF0], m3
ret
%endmacro ; SUB8x8_DCT8
INIT_XMM sse2
SUB8x8_DCT8
INIT_XMM sse4
SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8
; High-bit-depth add8x8_idct8 for x86_32, instantiated below for sse2 and avx.
; r0 = destination pixels, r1 = dct coefficients (used as scratch and
; overwritten).
; r1 is advanced by 128 so that the whole coefficient buffer can be addressed
; as sixteen 16-byte slots numbered -8..7 by SPILL_SHUFFLE/UNSPILL_SHUFFLE.
; NOTE(review): TRANSPOSE4x4D and STORE_DIFF are defined outside this file;
; STORE_DIFF presumably adds the residual to the destination row and clamps to
; [0, pw_pixel_max] - confirm against x86util.asm.
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
add r1, 128
cglobal_label .skip_prologue
; First pass over the even slots; inputs 0 and 4 are read from memory.
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -6,-4,-2,2,4,6
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-128],[r1+0]
; m4 is needed as the transpose temporary, so it takes a detour via memory.
mova [r1+0], m4
TRANSPOSE4x4D 0,1,2,3,4
; pd_32 is the rounding term added ahead of the second pass.
paddd m0, [pd_32]
mova m4, [r1+0]
SPILL_SHUFFLE r1, 0,1,2,3, -8,-6,-4,-2
TRANSPOSE4x4D 4,5,6,7,3
paddd m4, [pd_32]
SPILL_SHUFFLE r1, 4,5,6,7, 0,2,4,6
; First pass over the odd slots.
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, -5,-3,-1,3,5,7
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1-112],[r1+16]
mova [r1+16], m4
TRANSPOSE4x4D 0,1,2,3,4
mova m4, [r1+16]
mova [r1-112], m0
TRANSPOSE4x4D 4,5,6,7,0
SPILL_SHUFFLE r1, 4,5,6,7, 1,3,5,7
; Second pass, first half of the outputs.
UNSPILL_SHUFFLE r1, 5,6,7, -6,-4,-2
IDCT8_1D d,4,5,6,7,0,1,2,3,[r1-128],[r1-112]
SPILL_SHUFFLE r1, 4,5,6,7,0,1,2,3, -8,-7,-6,-5,-4,-3,-2,-1
; Second pass, second half of the outputs.
UNSPILL_SHUFFLE r1, 1,2,3,5,6,7, 2,4,6,3,5,7
IDCT8_1D d,0,1,2,3,4,5,6,7,[r1+0],[r1+16]
; m5..m7 are about to be reused as temp/zero/max, so save their results.
SPILL_SHUFFLE r1, 7,6,5, 7,6,5
mova m7, [pw_pixel_max]
pxor m6, m6
; Combine both halves of each row and write it to the destination.
mova m5, [r1-128]
STORE_DIFF m5, m0, m6, m7, [r0+0*FDEC_STRIDEB]
mova m0, [r1-112]
STORE_DIFF m0, m1, m6, m7, [r0+1*FDEC_STRIDEB]
mova m0, [r1-96]
STORE_DIFF m0, m2, m6, m7, [r0+2*FDEC_STRIDEB]
mova m0, [r1-80]
STORE_DIFF m0, m3, m6, m7, [r0+3*FDEC_STRIDEB]
mova m0, [r1-64]
STORE_DIFF m0, m4, m6, m7, [r0+4*FDEC_STRIDEB]
mova m0, [r1-48]
mova m1, [r1+80]
STORE_DIFF m0, m1, m6, m7, [r0+5*FDEC_STRIDEB]
mova m0, [r1-32]
mova m1, [r1+96]
STORE_DIFF m0, m1, m6, m7, [r0+6*FDEC_STRIDEB]
mova m0, [r1-16]
mova m1, [r1+112]
STORE_DIFF m0, m1, m6, m7, [r0+7*FDEC_STRIDEB]
RET
%endmacro ; ADD8x8_IDCT8
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%else ; !HIGH_BIT_DEPTH
; File-local helper: loads eight rows from r1 (FENC_STRIDE apart) and r2
; (FDEC_STRIDE apart) and leaves one LOAD_DIFF result per row in m0..m7.
; [r0] is used as an 8-byte scratch slot.
; NOTE(review): LOAD_DIFF is defined outside this file; its second operand is
; evidently a scratch register, and the result is presumably the widened
; difference of the two memory operands - confirm against x86util.asm.
INIT_MMX
ALIGN 16
load_diff_4x8_mmx:
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2+1*FDEC_STRIDE]
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2+3*FDEC_STRIDE]
LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+4*FDEC_STRIDE]
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+5*FDEC_STRIDE]
; Every register is about to be live: spill m0 so it can act as the scratch
; register for the final row, whose result goes into m7.
movq [r0], m0
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+6*FDEC_STRIDE]
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+7*FDEC_STRIDE]
movq m0, [r0]
ret
; Helper: one forward DCT8_1D pass on 16-bit words over m0..m7.
; Outputs 0, 4 and 6 are written to [r0], [r0+0x40] and [r0+0x60]; the rest
; stay in registers.
; NOTE(review): SAVE_MM_PERMUTATION presumably records the register renaming
; done by the macro so that callers see the same names - confirm in x86inc.asm.
cglobal dct8_mmx
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
SAVE_MM_PERMUTATION
ret
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
; MMX implementation: the 8x8 block is processed as two 4-column halves
; because an MMX register only holds four 16-bit values.
; r0 = dct, r1 = pix1, r2 = pix2. The output buffer doubles as scratch space.
; .skip_prologue is exported as a second entry point.
; NOTE(review): RESET_MM_PERMUTATION presumably returns the m# names to their
; default mapping after the renaming done inside the helpers - confirm in
; x86inc.asm.
cglobal sub8x8_dct8_mmx, 3,3
global sub8x8_dct8_mmx.skip_prologue
.skip_prologue:
RESET_MM_PERMUTATION
; Left four columns: load the differences and run the first pass.
call load_diff_4x8_mmx
call dct8_mmx
; Transpose the two 4x4 sub-blocks, reloading the outputs that dct8_mmx left
; in memory, and keep the result in the buffer.
UNSPILL r0, 0
TRANSPOSE4x4W 0,1,2,3,4
SPILL r0, 0,1,2,3
UNSPILL r0, 4,6
TRANSPOSE4x4W 4,5,6,7,0
SPILL r0, 4,5,6,7
RESET_MM_PERMUTATION
; Right four columns: step 4 pixels to the right in both sources and 8 bytes
; in the buffer.
add r1, 4
add r2, 4
add r0, 8
call load_diff_4x8_mmx
sub r1, 4
sub r2, 4
call dct8_mmx
sub r0, 8
UNSPILL r0+8, 4,6
TRANSPOSE4x4W 4,5,6,7,0
SPILL r0+8, 4,5,6,7
UNSPILL r0+8, 0
TRANSPOSE4x4W 0,1,2,3,5
; Exchange sub-blocks between the two halves so that each second-pass call
; sees a complete set of eight inputs.
UNSPILL r0, 4,5,6,7
SPILL_SHUFFLE r0, 0,1,2,3, 4,5,6,7
movq mm4, m6 ; depends on the permutation to not produce conflicts
movq mm0, m4
movq mm1, m5
movq mm2, mm4
movq mm3, m7
RESET_MM_PERMUTATION
UNSPILL r0+8, 4,5,6,7
; Second pass on the half stored at r0+8.
add r0, 8
call dct8_mmx
sub r0, 8
; dct8_mmx already wrote outputs 0, 4 and 6; store the remaining five.
SPILL r0+8, 1,2,3,5,7
RESET_MM_PERMUTATION
; Second pass on the half stored at r0.
UNSPILL r0, 0,1,2,3,4,5,6,7
call dct8_mmx
SPILL r0, 1,2,3,5,7
ret
; Helper: one inverse IDCT8_1D pass on 16-bit words.
; Inputs 1,2,3,5,6,7 are expected in registers, inputs 0 and 4 are read from
; [r1+0] and [r1+64]; all eight outputs are left in m0..m7.
; NOTE(review): SAVE_MM_PERMUTATION presumably records the register renaming
; done by the macro so that callers see the same names - confirm in x86inc.asm.
cglobal idct8_mmx
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SAVE_MM_PERMUTATION
ret
; Adds one row of eight 16-bit residuals to eight destination bytes.
;   %1 = row index (scaled by FDEC_STRIDE from r0)
;   %2 = residuals for the low four pixels
;   %3 = residuals for the high four pixels
; Expects m0 to be zero (it supplies the high byte when widening); overwrites
; m1 and m2.
%macro ADD_STORE_ROW 3
movq m1, [r0+%1*FDEC_STRIDE]
; Widen the eight bytes to words: high four into m2, low four stay in m1.
punpckhbw m2, m1, m0
punpcklbw m1, m0
paddw m1, %2
paddw m2, %3
; Pack back to bytes with unsigned saturation.
packuswb m1, m2
movq [r0+%1*FDEC_STRIDE], m1
%endmacro
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; MMX implementation: the 8x8 block is processed as two 4-column halves
; because an MMX register only holds four 16-bit values.
; r0 = destination pixels, r1 = dct coefficients (used as scratch and
; overwritten).
; .skip_prologue is exported as a second entry point.
; NOTE(review): the INIT_MMX lines inside the body are presumably there to
; reset the m# register naming after each helper call - confirm in x86inc.asm.
cglobal add8x8_idct8_mmx, 2,2
global add8x8_idct8_mmx.skip_prologue
.skip_prologue:
INIT_MMX
; Rounding term for the >>6 applied after the second pass, added once to the
; first coefficient.
add word [r1], 32
; First pass, left half.
UNSPILL r1, 1,2,3,5,6,7
call idct8_mmx
SPILL r1, 7
TRANSPOSE4x4W 0,1,2,3,7
SPILL r1, 0,1,2,3
UNSPILL r1, 7
TRANSPOSE4x4W 4,5,6,7,0
SPILL r1, 4,5,6,7
INIT_MMX
; First pass, right half (8 bytes further into each row).
UNSPILL r1+8, 1,2,3,5,6,7
add r1, 8
call idct8_mmx
sub r1, 8
SPILL r1+8, 7
TRANSPOSE4x4W 0,1,2,3,7
SPILL r1+8, 0,1,2,3
UNSPILL r1+8, 7
TRANSPOSE4x4W 4,5,6,7,0
SPILL r1+8, 4,5,6,7
INIT_MMX
; Exchange the qwords at 0x08 and 0x40 so that idct8_mmx finds its two memory
; inputs where it expects them.
movq m3, [r1+0x08]
movq m0, [r1+0x40]
movq [r1+0x40], m3
movq [r1+0x08], m0
; memory layout at this time:
; A0------ A1------
; B0------ F0------
; C0------ G0------
; D0------ H0------
; E0------ E1------
; B1------ F1------
; C1------ G1------
; D1------ H1------
; Second pass, first half, followed by the final >>6.
UNSPILL_SHUFFLE r1, 1,2,3, 5,6,7
UNSPILL r1+8, 5,6,7
add r1, 8
call idct8_mmx
sub r1, 8
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
psraw m4, 6
psraw m5, 6
psraw m6, 6
psraw m7, 6
; Store these results while loading the inputs of the other half. The
; stores use permuted m# names and the loads use physical mm# names; the
; trailing comments give the physical register behind each m#.
movq [r1+0x08], m0 ; mm4
movq [r1+0x48], m4 ; mm5
movq [r1+0x58], m5 ; mm0
movq [r1+0x68], m6 ; mm2
movq [r1+0x78], m7 ; mm6
movq mm5, [r1+0x18]
movq mm6, [r1+0x28]
movq [r1+0x18], m1 ; mm1
movq [r1+0x28], m2 ; mm7
movq mm7, [r1+0x38]
movq [r1+0x38], m3 ; mm3
movq mm1, [r1+0x10]
movq mm2, [r1+0x20]
movq mm3, [r1+0x30]
; Second pass, other half, followed by the final >>6.
call idct8_mmx
psraw m0, 6
psraw m1, 6
psraw m2, 6
psraw m3, 6
psraw m4, 6
psraw m5, 6
psraw m6, 6
psraw m7, 6
; ADD_STORE_ROW needs m0 = 0 and overwrites m1/m2, so rows 0..2 are passed
; from memory instead of registers.
SPILL r1, 0,1,2
pxor m0, m0
ADD_STORE_ROW 0, [r1+0x00], [r1+0x08]
ADD_STORE_ROW 1, [r1+0x10], [r1+0x18]
ADD_STORE_ROW 2, [r1+0x20], [r1+0x28]
ADD_STORE_ROW 3, m3, [r1+0x38]
ADD_STORE_ROW 4, m4, [r1+0x48]
ADD_STORE_ROW 5, m5, [r1+0x58]
ADD_STORE_ROW 6, m6, [r1+0x68]
ADD_STORE_ROW 7, m7, [r1+0x78]
ret
; Emits the 8-bit-depth x86_32 versions of sub8x8_dct (four 4x4 transforms)
; and sub8x8_dct8 (one 8x8 transform) for the instruction set selected by the
; preceding INIT_XMM. Instantiated below for sse2, ssse3, avx and xop.
; For both functions r0 = dct, r1 = pix1, r2 = pix2; r2 is advanced by
; 4*FDEC_STRIDE in the prologue. With only eight vector registers available,
; the output buffer at r0 is used to spill registers between steps.
; NOTE(review): LOAD_DIFF8x4, DCT4_1D, TRANSPOSE2x4x4W, TRANSPOSE8x8W and
; STORE_DCT are defined outside this file - see x86util.asm for their operand
; conventions.
%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3
add r2, 4*FDEC_STRIDE
cglobal_label .skip_prologue
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
LOAD_DIFF8x4 0, 1, 2, 3, 6, 7, r1, r2-4*FDEC_STRIDE
; m1/m2 are needed as operands five and six of the second load; the SWAP pair
; keeps the value loaded into m7 above reachable under the name m2 meanwhile.
SPILL r0, 1,2
SWAP 2, 7
LOAD_DIFF8x4 4, 5, 6, 7, 1, 2, r1, r2-4*FDEC_STRIDE
UNSPILL r0, 1
SPILL r0, 7
SWAP 2, 7
UNSPILL r0, 2
; First pass and transpose for each group of four registers; the register
; passed as the temporary is spilled around each use.
DCT4_1D 0, 1, 2, 3, 7
TRANSPOSE2x4x4W 0, 1, 2, 3, 7
UNSPILL r0, 7
SPILL r0, 2
DCT4_1D 4, 5, 6, 7, 2
TRANSPOSE2x4x4W 4, 5, 6, 7, 2
UNSPILL r0, 2
SPILL r0, 6
; Second pass and store, 64 bytes per group.
DCT4_1D 0, 1, 2, 3, 6
UNSPILL r0, 6
STORE_DCT 0, 1, 2, 3, r0, 0
DCT4_1D 4, 5, 6, 7, 3
STORE_DCT 4, 5, 6, 7, r0, 64
ret
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8, 3,3
add r2, 4*FDEC_STRIDE
cglobal_label .skip_prologue
%if cpuflag(ssse3)
mova m7, [hsub_mul]
LOAD_DIFF8x4 0, 1, 2, 3, 4, 7, r1, r2-4*FDEC_STRIDE
SPILL r0, 0,1
SWAP 1, 7
LOAD_DIFF8x4 4, 5, 6, 7, 0, 1, r1, r2-4*FDEC_STRIDE
UNSPILL r0, 0,1
%else
; Without SSSE3 the rows are loaded one at a time with m7 as scratch; for the
; last row m0 is spilled and serves as scratch instead.
LOAD_DIFF m0, m7, none, [r1+0*FENC_STRIDE], [r2-4*FDEC_STRIDE]
LOAD_DIFF m1, m7, none, [r1+1*FENC_STRIDE], [r2-3*FDEC_STRIDE]
LOAD_DIFF m2, m7, none, [r1+2*FENC_STRIDE], [r2-2*FDEC_STRIDE]
LOAD_DIFF m3, m7, none, [r1+3*FENC_STRIDE], [r2-1*FDEC_STRIDE]
LOAD_DIFF m4, m7, none, [r1+4*FENC_STRIDE], [r2+0*FDEC_STRIDE]
LOAD_DIFF m5, m7, none, [r1+5*FENC_STRIDE], [r2+1*FDEC_STRIDE]
SPILL r0, 0
LOAD_DIFF m6, m7, none, [r1+6*FENC_STRIDE], [r2+2*FDEC_STRIDE]
LOAD_DIFF m7, m0, none, [r1+7*FENC_STRIDE], [r2+3*FDEC_STRIDE]
UNSPILL r0, 0
%endif
; First pass; outputs 0, 4 and 6 go to slots 0, 4 and 6 of the buffer.
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
UNSPILL r0, 0,4
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r0+0x60],[r0+0x40],1
UNSPILL r0, 4
; Second pass; it writes outputs 0, 4 and 6 itself, the SPILL stores the rest.
DCT8_1D w,0,1,2,3,4,5,6,7,[r0],[r0+0x40],[r0+0x60]
SPILL r0, 1,2,3,5,7
ret
%endmacro
INIT_XMM sse2
; For the sse2 build only, movdqa and punpcklqdq are replaced by movaps and
; movlhps while the macro is expanded.
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8
%undef movdqa
%undef punpcklqdq
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
INIT_XMM xop
DCT_SUB8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
; Emits the 8-bit-depth x86_32 add8x8_idct (four 4x4 inverse transforms added
; to an 8x8 pixel block). Instantiated below for sse2 and avx.
; r0 = destination pixels, advanced by 4*FDEC_STRIDE in the prologue so the
; eight rows are addressed as -4..+3; r1 = dct coefficients, also used as
; spill space.
; NOTE(review): SBUTTERFLY, IDCT4_1D, TRANSPOSE2x4x4W, DIFFx2 and STORE_IDCT
; are defined outside this file - see x86util.asm for their operand
; conventions.
%macro ADD8x8 0
cglobal add8x8_idct, 2,2
add r0, 4*FDEC_STRIDE
cglobal_label .skip_prologue
; Load the coefficients, slots 1 and 2 (and 5 and 6) going to swapped
; registers, then interleave qwords.
UNSPILL_SHUFFLE r1, 0,2,1,3, 0,1,2,3
SBUTTERFLY qdq, 0, 1, 4
SBUTTERFLY qdq, 2, 3, 4
UNSPILL_SHUFFLE r1, 4,6,5,7, 4,5,6,7
SPILL r1, 0
SBUTTERFLY qdq, 4, 5, 0
SBUTTERFLY qdq, 6, 7, 0
UNSPILL r1,0
; First pass and transpose; the register lent out as the transpose temporary
; is spilled around each use.
IDCT4_1D w,0,1,2,3,r1
SPILL r1, 4
TRANSPOSE2x4x4W 0,1,2,3,4
UNSPILL r1, 4
IDCT4_1D w,4,5,6,7,r1
SPILL r1, 0
TRANSPOSE2x4x4W 4,5,6,7,0
UNSPILL r1, 0
; Second pass, with the rounding term pw_32 added first.
paddw m0, [pw_32]
IDCT4_1D w,0,1,2,3,r1
paddw m4, [pw_32]
IDCT4_1D w,4,5,6,7,r1
; m6 and m7 are handed to DIFFx2 as operands three and four (m7 zeroed), so
; their results are saved and later reloaded into m0 and m2.
SPILL r1, 6,7
pxor m7, m7
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
UNSPILL_SHUFFLE r1, 0,2, 6,7
DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
%endmacro ; ADD8x8
INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Emits the 8-bit-depth x86_32 add8x8_idct8 (one 8x8 inverse transform added
; to an 8x8 pixel block). Instantiated below for sse2 and avx.
; r0 = destination pixels, advanced by 4*FDEC_STRIDE in the prologue so the
; eight rows are addressed as -4..+3; r1 = dct coefficients, also used as
; spill space.
; NOTE(review): TRANSPOSE8x8W, DIFFx2 and STORE_IDCT are defined outside this
; file - see x86util.asm for their operand conventions.
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2
add r0, 4*FDEC_STRIDE
cglobal_label .skip_prologue
; First pass: inputs 0 and 4 are read straight from memory by IDCT8_1D.
UNSPILL r1, 1,2,3,5,6,7
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
SPILL r1, 6
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,[r1+0x60],[r1+0x40],1
; Rounding term added ahead of the second pass.
paddw m0, [pw_32]
SPILL r1, 0
; Second pass.
IDCT8_1D w,0,1,2,3,4,5,6,7,[r1+0],[r1+64]
; m6 and m7 are handed to DIFFx2 as operands three and four (m7 zeroed), so
; their results are saved and later reloaded into m0 and m2.
SPILL r1, 6,7
pxor m7, m7
DIFFx2 m0, m1, m6, m7, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]; m5
DIFFx2 m2, m3, m6, m7, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]; m5
UNSPILL_SHUFFLE r1, 0,2, 6,7
DIFFx2 m4, m5, m6, m7, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]; m5
DIFFx2 m0, m2, m6, m7, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]; m5
STORE_IDCT m1, m3, m5, m2
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%endif ; !HIGH_BIT_DEPTH

424
common/x86/dct-64.asm Normal file
View File

@@ -0,0 +1,424 @@
;*****************************************************************************
;* dct-64.asm: x86_64 transform and zigzag
;*****************************************************************************
;* Copyright (C) 2003-2025 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <holger@lubitz.org>
;* Laurent Aimar <fenrir@via.ecp.fr>
;* Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION .text
cextern pd_32
cextern pw_pixel_max
cextern pw_2
cextern pw_m2
cextern pw_32
cextern hsub_mul
; in: size, m0..m7, temp, temp
; out: m0..m7
; One 1-D pass of the 8-point forward transform, for builds with enough
; vector registers to keep everything in registers.
;   %1      element-size suffix glued onto psra/padd/psub (used in this file
;           with w and d)
;   %2..%9  register numbers holding the eight inputs and, after the closing
;           SWAP, the eight outputs
;   %10,%11 register numbers of two temporaries
; NOTE(review): SUMSUB_BA and SWAP are defined outside this file (presumably
; x86util.asm / x86inc.asm); the fourth SUMSUB_BA operand is evidently a
; scratch register.
%macro DCT8_1D 11
SUMSUB_BA %1, %6, %5, %11 ; %6=s34, %5=d34
SUMSUB_BA %1, %7, %4, %11 ; %7=s25, %4=d25
SUMSUB_BA %1, %8, %3, %11 ; %8=s16, %3=d16
SUMSUB_BA %1, %9, %2, %11 ; %9=s07, %2=d07
SUMSUB_BA %1, %7, %8, %11 ; %7=a1, %8=a3
SUMSUB_BA %1, %6, %9, %11 ; %6=a0, %9=a2
; Odd half: a4..a7 are built from the four differences.
psra%1 m%10, m%2, 1
padd%1 m%10, m%2
padd%1 m%10, m%3
padd%1 m%10, m%4 ; %10=a4
psra%1 m%11, m%5, 1
padd%1 m%11, m%5
padd%1 m%11, m%3
psub%1 m%11, m%4 ; %11=a7
SUMSUB_BA %1, %5, %2
psub%1 m%2, m%4
psub%1 m%5, m%3
psra%1 m%4, 1
psra%1 m%3, 1
psub%1 m%2, m%4 ; %2=a5
psub%1 m%5, m%3 ; %5=a6
psra%1 m%3, m%11, 2
padd%1 m%3, m%10 ; %3=b1
psra%1 m%10, 2
psub%1 m%10, m%11 ; %10=b7
; Even half.
SUMSUB_BA %1, %7, %6, %11 ; %7=b0, %6=b4
psra%1 m%4, m%8, 1
padd%1 m%4, m%9 ; %4=b2
psra%1 m%9, 1
psub%1 m%9, m%8 ; %9=b6
psra%1 m%8, m%5, 2
padd%1 m%8, m%2 ; %8=b3
psra%1 m%2, 2
psub%1 m%5, m%2 ; %5=b5
; Rename registers so that b0..b7 end up under %2..%9.
SWAP %2, %7, %5, %8, %9, %10
%endmacro
; One 1-D pass of the 8-point inverse transform, for builds with enough
; vector registers to keep everything in registers.
;   %1      element-size suffix glued onto psra/padd/psub (used in this file
;           with w and d)
;   %2..%9  register numbers holding the eight inputs and, after the closing
;           SWAPs, the eight outputs
;   %10,%11 register numbers of two temporaries (both are written before they
;           are read)
; The %N labels in the trailing comments name the macro parameter whose
; register receives the value on that line.
; NOTE(review): SUMSUB_BA and SWAP are defined outside this file (presumably
; x86util.asm / x86inc.asm); the fourth SUMSUB_BA operand is evidently a
; scratch register.
%macro IDCT8_1D 11
SUMSUB_BA %1, %6, %2, %10 ; %6=a0, %2=a2
psra%1 m%10, m%3, 1
padd%1 m%10, m%3
padd%1 m%10, m%5
padd%1 m%10, m%7 ; %10=a7
psra%1 m%11, m%4, 1
psub%1 m%11, m%8 ; %11=a4
psra%1 m%8, 1
padd%1 m%8, m%4 ; %8=a6
psra%1 m%4, m%7, 1
padd%1 m%4, m%7
padd%1 m%4, m%9
psub%1 m%4, m%3 ; %4=a5
psub%1 m%3, m%5
psub%1 m%7, m%5
padd%1 m%3, m%9
psub%1 m%7, m%9
psra%1 m%5, 1
psra%1 m%9, 1
psub%1 m%3, m%5 ; %3=a3
psub%1 m%7, m%9 ; %7=a1
psra%1 m%5, m%10, 2
padd%1 m%5, m%7 ; %5=b1
psra%1 m%7, 2
psub%1 m%10, m%7 ; %10=b7
; m%7 is dead from here on and serves as the SUMSUB_BA scratch register.
SUMSUB_BA %1, %8, %6, %7 ; %8=b0, %6=b6
SUMSUB_BA %1, %11, %2, %7 ; %11=b2, %2=b4
psra%1 m%9, m%4, 2
padd%1 m%9, m%3 ; %9=b3
psra%1 m%3, 2
psub%1 m%3, m%4 ; %3=b5
SUMSUB_BA %1, %10, %8, %7 ; %10=c0, %8=c7
SUMSUB_BA %1, %3, %11, %7 ; %3=c1, %11=c6
SUMSUB_BA %1, %9, %2, %7 ; %9=c2, %2=c5
SUMSUB_BA %1, %5, %6, %7 ; %5=c3, %6=c4
; Rename registers so that c0..c7 end up under %2..%9.
SWAP %11, %4
SWAP %2, %10, %7
SWAP %4, %9, %8
%endmacro
%if HIGH_BIT_DEPTH
; High-bit-depth sub8x8_dct8 for x86_64, instantiated below for sse2, sse4
; and avx.
; r0 = dct output, r1 = pix1, r2 = pix2. Fourteen vector registers are
; requested, so nothing has to be staged in memory between the passes.
; NOTE(review): TAIL_CALL, LOAD_DIFF8x4, TRANSPOSE4x4W and WIDEN_SXWD are
; defined outside this file; the phase descriptions below go by their names
; and by how their results are used here - confirm against x86inc.asm and
; x86util.asm.
%macro SUB8x8_DCT8 0
cglobal sub8x8_dct8, 3,3,14
TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
; Fetch the eight rows of differences into m0..m7.
LOAD_DIFF8x4 0,1,2,3, none,none, r1, r2
LOAD_DIFF8x4 4,5,6,7, none,none, r1, r2
; First pass on 16-bit words, m8/m9 as temporaries.
DCT8_1D w, 0,1,2,3,4,5,6,7, 8,9
; First half: transpose, widen each register into a pair of dword registers,
; run the second pass on dwords and store every other 16-byte slot.
TRANSPOSE4x4W 0,1,2,3,8
WIDEN_SXWD 0,8
WIDEN_SXWD 1,9
WIDEN_SXWD 2,10
WIDEN_SXWD 3,11
DCT8_1D d, 0,8,1,9,2,10,3,11, 12,13
mova [r0+0x00], m0
mova [r0+0x20], m8
mova [r0+0x40], m1
mova [r0+0x60], m9
mova [r0+0x80], m2
mova [r0+0xA0], m10
mova [r0+0xC0], m3
mova [r0+0xE0], m11
; Second half: same steps on m4..m7, stored to the remaining slots.
TRANSPOSE4x4W 4,5,6,7,0
WIDEN_SXWD 4,0
WIDEN_SXWD 5,1
WIDEN_SXWD 6,2
WIDEN_SXWD 7,3
DCT8_1D d,4,0,5,1,6,2,7,3, 8,9
mova [r0+0x10], m4
mova [r0+0x30], m0
mova [r0+0x50], m5
mova [r0+0x70], m1
mova [r0+0x90], m6
mova [r0+0xB0], m2
mova [r0+0xD0], m7
mova [r0+0xF0], m3
ret
%endmacro ; SUB8x8_DCT8
INIT_XMM sse2
SUB8x8_DCT8
INIT_XMM sse4
SUB8x8_DCT8
INIT_XMM avx
SUB8x8_DCT8
; High-bit-depth add8x8_idct8 for x86_64, instantiated below for sse2 and avx.
; r0 = destination pixels, r1 = dct coefficients (partly overwritten as
; scratch space).
; r1 is advanced by 128 so that the 256-byte coefficient buffer is addressed
; with offsets -128..+112.
; NOTE(review): TAIL_CALL, TRANSPOSE4x4D and STORE_DIFF are defined outside
; this file; STORE_DIFF presumably adds the residual to the destination row
; and clamps to [0, pw_pixel_max] - confirm against x86util.asm.
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,16
add r1, 128
TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
; First pass over the first 16 bytes of each 32-byte row.
mova m0, [r1-128]
mova m1, [r1-96]
mova m2, [r1-64]
mova m3, [r1-32]
mova m4, [r1+ 0]
mova m5, [r1+32]
mova m6, [r1+64]
mova m7, [r1+96]
IDCT8_1D d,0,1,2,3,4,5,6,7,8,9
TRANSPOSE4x4D 0,1,2,3,8
TRANSPOSE4x4D 4,5,6,7,8
; pd_32 is the rounding term added ahead of the second pass.
paddd m0, [pd_32]
paddd m4, [pd_32]
; m6/m7 are needed as temporaries below; park their contents in the buffer.
mova [r1+64], m6
mova [r1+96], m7
; First pass over the second 16 bytes of each row.
mova m8, [r1-112]
mova m9, [r1-80]
mova m10, [r1-48]
mova m11, [r1-16]
mova m12, [r1+16]
mova m13, [r1+48]
mova m14, [r1+80]
mova m15, [r1+112]
IDCT8_1D d,8,9,10,11,12,13,14,15,6,7
TRANSPOSE4x4D 8,9,10,11,6
TRANSPOSE4x4D 12,13,14,15,6
; Second pass, first group of registers.
IDCT8_1D d,0,1,2,3,8,9,10,11,6,7
; m8/m9 become the temporaries of the next pass; park their results.
mova [r1-112], m8
mova [r1-80], m9
mova m6, [r1+64]
mova m7, [r1+96]
; Second pass, second group of registers.
IDCT8_1D d,4,5,6,7,12,13,14,15,8,9
pxor m8, m8
mova m9, [pw_pixel_max]
STORE_DIFF m0, m4, m8, m9, [r0+0*FDEC_STRIDEB]
STORE_DIFF m1, m5, m8, m9, [r0+1*FDEC_STRIDEB]
STORE_DIFF m2, m6, m8, m9, [r0+2*FDEC_STRIDEB]
STORE_DIFF m3, m7, m8, m9, [r0+3*FDEC_STRIDEB]
; Bring back the two results that were parked above.
mova m0, [r1-112]
mova m1, [r1-80]
STORE_DIFF m0, m12, m8, m9, [r0+4*FDEC_STRIDEB]
STORE_DIFF m1, m13, m8, m9, [r0+5*FDEC_STRIDEB]
STORE_DIFF m10, m14, m8, m9, [r0+6*FDEC_STRIDEB]
STORE_DIFF m11, m15, m8, m9, [r0+7*FDEC_STRIDEB]
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
%else ; !HIGH_BIT_DEPTH
; Emits the 8-bit-depth x86_64 versions of sub8x8_dct (four 4x4 transforms)
; and sub8x8_dct8 (one 8x8 transform) for the instruction set selected by the
; preceding INIT_XMM. Instantiated below for sse2, ssse3, avx and xop.
; For both functions r0 = dct, r1 = pix1, r2 = pix2; r2 is advanced by
; 4*FDEC_STRIDE in the prologue.
; The prologue loads hsub_mul into m7 (ssse3 and later); after .skip_prologue
; a SWAP renames that register to m9 or m10, so code entering at
; .skip_prologue has to provide the same value in m7.
; NOTE(review): TAIL_CALL, LOAD_DIFF8x4, DCT4_1D, TRANSPOSE2x4x4W,
; TRANSPOSE8x8W and STORE_DCT are defined outside this file - see x86inc.asm
; and x86util.asm for their operand conventions.
%macro DCT_SUB8 0
cglobal sub8x8_dct, 3,3,10
add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
SWAP 7, 9
LOAD_DIFF8x4 0, 1, 2, 3, 8, 9, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 9, r1, r2-4*FDEC_STRIDE
; First pass and transpose for each group of four registers, m8 as temporary.
DCT4_1D 0, 1, 2, 3, 8
TRANSPOSE2x4x4W 0, 1, 2, 3, 8
DCT4_1D 4, 5, 6, 7, 8
TRANSPOSE2x4x4W 4, 5, 6, 7, 8
; Second pass and store, 64 bytes per group.
DCT4_1D 0, 1, 2, 3, 8
STORE_DCT 0, 1, 2, 3, r0, 0
DCT4_1D 4, 5, 6, 7, 8
STORE_DCT 4, 5, 6, 7, r0, 64
ret
;-----------------------------------------------------------------------------
; void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
;-----------------------------------------------------------------------------
cglobal sub8x8_dct8, 3,3,11
add r2, 4*FDEC_STRIDE
%if cpuflag(ssse3)
mova m7, [hsub_mul]
%endif
TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
SWAP 7, 10
LOAD_DIFF8x4 0, 1, 2, 3, 4, 10, r1, r2-4*FDEC_STRIDE
LOAD_DIFF8x4 4, 5, 6, 7, 8, 10, r1, r2-4*FDEC_STRIDE
; Pass, transpose, pass - entirely in registers with m8/m9 as temporaries.
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
movdqa [r0+0x00], m0
movdqa [r0+0x10], m1
movdqa [r0+0x20], m2
movdqa [r0+0x30], m3
movdqa [r0+0x40], m4
movdqa [r0+0x50], m5
movdqa [r0+0x60], m6
movdqa [r0+0x70], m7
ret
%endmacro
INIT_XMM sse2
; For the sse2 build only, movdqa and punpcklqdq are replaced by movaps and
; movlhps while the macro is expanded.
%define movdqa movaps
%define punpcklqdq movlhps
DCT_SUB8
%undef movdqa
%undef punpcklqdq
INIT_XMM ssse3
DCT_SUB8
INIT_XMM avx
DCT_SUB8
INIT_XMM xop
DCT_SUB8
; AVX2 sub16x16_dct8: r0 = dct, r1 = pix1, r2 = pix2.
; The 16x16 block is handled as two 16x8 strips. Within a strip each 256-bit
; register carries two 8x8 blocks side by side, one per 128-bit lane.
; r0 is advanced by 128 up front so that the low lane is stored at
; [r0-0x80+k] and the high lane at [r0+k], i.e. into two consecutive 128-byte
; coefficient blocks.
; NOTE(review): LOAD_DIFF16x2_AVX2 and TRANSPOSE8x8W are defined outside this
; file; the 4*FDEC_STRIDE bias on r2 is presumably what LOAD_DIFF16x2_AVX2
; expects - confirm against x86util.asm.
INIT_YMM avx2
cglobal sub16x16_dct8, 3,3,10
add r0, 128
add r2, 4*FDEC_STRIDE
call .sub16x8_dct8
; Move on to the next strip: two coefficient blocks further, eight rows down.
add r0, 256
add r1, FENC_STRIDE*8
add r2, FDEC_STRIDE*8
call .sub16x8_dct8
RET
; Local subroutine: transforms one 16x8 strip.
.sub16x8_dct8:
LOAD_DIFF16x2_AVX2 0, 1, 2, 3, 0, 1
LOAD_DIFF16x2_AVX2 2, 3, 4, 5, 2, 3
LOAD_DIFF16x2_AVX2 4, 5, 6, 7, 4, 5
LOAD_DIFF16x2_AVX2 6, 7, 8, 9, 6, 7
; Pass, transpose, pass - with m8/m9 as temporaries.
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
DCT8_1D w, 0,1,2,3,4,5,6,7,8,9
; Each row: low lane to the first block, high lane to the second.
mova [r0-0x80+0x00], xm0
vextracti128 [r0+0x00], m0, 1
mova [r0-0x80+0x10], xm1
vextracti128 [r0+0x10], m1, 1
mova [r0-0x80+0x20], xm2
vextracti128 [r0+0x20], m2, 1
mova [r0-0x80+0x30], xm3
vextracti128 [r0+0x30], m3, 1
mova [r0-0x80+0x40], xm4
vextracti128 [r0+0x40], m4, 1
mova [r0-0x80+0x50], xm5
vextracti128 [r0+0x50], m5, 1
mova [r0-0x80+0x60], xm6
vextracti128 [r0+0x60], m6, 1
mova [r0-0x80+0x70], xm7
vextracti128 [r0+0x70], m7, 1
ret
;-----------------------------------------------------------------------------
; void add8x8_idct8( uint8_t *p_dst, int16_t dct[8][8] )
;-----------------------------------------------------------------------------
; Emits the 8-bit-depth x86_64 add8x8_idct8 (one 8x8 inverse transform added
; to an 8x8 pixel block). Instantiated below for sse2 and avx.
; r0 = destination pixels, advanced by 4*FDEC_STRIDE in the prologue so the
; eight rows are addressed as -4..+3; r1 = dct coefficients.
; The prologue zeroes m7; after .skip_prologue a SWAP renames that register to
; m9, so code entering at .skip_prologue has to provide a zeroed m7 itself.
; NOTE(review): TAIL_CALL, TRANSPOSE8x8W, DIFFx2 and STORE_IDCT are defined
; outside this file - see x86inc.asm and x86util.asm for their operand
; conventions.
%macro ADD8x8_IDCT8 0
cglobal add8x8_idct8, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
SWAP 7, 9
movdqa m0, [r1+0x00]
movdqa m1, [r1+0x10]
movdqa m2, [r1+0x20]
movdqa m3, [r1+0x30]
movdqa m4, [r1+0x40]
movdqa m5, [r1+0x50]
movdqa m6, [r1+0x60]
movdqa m7, [r1+0x70]
; Pass, transpose, pass - with m8/m10 as temporaries (m9 holds the zero).
IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
TRANSPOSE8x8W 0,1,2,3,4,5,6,7,8
paddw m0, [pw_32] ; rounding for the >>6 at the end
IDCT8_1D w,0,1,2,3,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
%endmacro ; ADD8x8_IDCT8
INIT_XMM sse2
ADD8x8_IDCT8
INIT_XMM avx
ADD8x8_IDCT8
;-----------------------------------------------------------------------------
; void add8x8_idct( uint8_t *pix, int16_t dct[4][4][4] )
;-----------------------------------------------------------------------------
; Emits the 8-bit-depth x86_64 add8x8_idct (four 4x4 inverse transforms added
; to an 8x8 pixel block). Instantiated below for sse2 and avx.
; r0 = destination pixels, advanced by 4*FDEC_STRIDE in the prologue so the
; eight rows are addressed as -4..+3; r1 = dct coefficients.
; The prologue zeroes m7; after .skip_prologue a SWAP renames that register to
; m9, so code entering at .skip_prologue has to provide a zeroed m7 itself.
; NOTE(review): TAIL_CALL, SBUTTERFLY, IDCT4_1D, TRANSPOSE2x4x4W, DIFFx2 and
; STORE_IDCT are defined outside this file - see x86inc.asm and x86util.asm
; for their operand conventions.
%macro ADD8x8 0
cglobal add8x8_idct, 2,2,11
add r0, 4*FDEC_STRIDE
pxor m7, m7
TAIL_CALL .skip_prologue, 0
cglobal_label .skip_prologue
SWAP 7, 9
; Load the coefficients, the second and third 16-byte chunk of each group of
; four going to swapped registers, then interleave qwords.
mova m0, [r1+ 0]
mova m2, [r1+16]
mova m1, [r1+32]
mova m3, [r1+48]
SBUTTERFLY qdq, 0, 1, 4
SBUTTERFLY qdq, 2, 3, 4
mova m4, [r1+64]
mova m6, [r1+80]
mova m5, [r1+96]
mova m7, [r1+112]
SBUTTERFLY qdq, 4, 5, 8
SBUTTERFLY qdq, 6, 7, 8
; First pass and transpose for each group of four registers.
IDCT4_1D w,0,1,2,3,8,10
TRANSPOSE2x4x4W 0,1,2,3,8
IDCT4_1D w,4,5,6,7,8,10
TRANSPOSE2x4x4W 4,5,6,7,8
; Second pass, with the rounding term pw_32 added first.
paddw m0, [pw_32]
IDCT4_1D w,0,1,2,3,8,10
paddw m4, [pw_32]
IDCT4_1D w,4,5,6,7,8,10
DIFFx2 m0, m1, m8, m9, [r0-4*FDEC_STRIDE], [r0-3*FDEC_STRIDE]
DIFFx2 m2, m3, m8, m9, [r0-2*FDEC_STRIDE], [r0-1*FDEC_STRIDE]
DIFFx2 m4, m5, m8, m9, [r0+0*FDEC_STRIDE], [r0+1*FDEC_STRIDE]
DIFFx2 m6, m7, m8, m9, [r0+2*FDEC_STRIDE], [r0+3*FDEC_STRIDE]
STORE_IDCT m1, m3, m5, m7
ret
%endmacro ; ADD8x8
INIT_XMM sse2
ADD8x8
INIT_XMM avx
ADD8x8
%endif ; !HIGH_BIT_DEPTH

2287
common/x86/dct-a.asm Normal file

File diff suppressed because it is too large Load Diff

249
common/x86/dct.h Normal file
View File

@@ -0,0 +1,249 @@
/*****************************************************************************
* dct.h: x86 transform and zigzag
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_DCT_H
#define X264_X86_DCT_H
#define x264_sub4x4_dct_mmx x264_template(sub4x4_dct_mmx)
void x264_sub4x4_dct_mmx ( dctcoef dct [16], pixel *pix1, pixel *pix2 );
#define x264_sub8x8_dct_mmx x264_template(sub8x8_dct_mmx)
void x264_sub8x8_dct_mmx ( dctcoef dct[ 4][16], pixel *pix1, pixel *pix2 );
#define x264_sub16x16_dct_mmx x264_template(sub16x16_dct_mmx)
void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
#define x264_sub8x8_dct_sse2 x264_template(sub8x8_dct_sse2)
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_sse2 x264_template(sub16x16_dct_sse2)
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub4x4_dct_ssse3 x264_template(sub4x4_dct_ssse3)
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub4x4_dct_avx512 x264_template(sub4x4_dct_avx512)
void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_ssse3 x264_template(sub8x8_dct_ssse3)
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_ssse3 x264_template(sub16x16_dct_ssse3)
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_avx x264_template(sub8x8_dct_avx)
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_avx x264_template(sub16x16_dct_avx)
void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_xop x264_template(sub8x8_dct_xop)
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_xop x264_template(sub16x16_dct_xop)
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_avx2 x264_template(sub8x8_dct_avx2)
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_avx512 x264_template(sub8x8_dct_avx512)
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_avx2 x264_template(sub16x16_dct_avx2)
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct_avx512 x264_template(sub16x16_dct_avx512)
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_dc_mmx2 x264_template(sub8x8_dct_dc_mmx2)
void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct_dc_sse2 x264_template(sub8x8_dct_dc_sse2)
void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
#define x264_sub8x8_dct_dc_avx512 x264_template(sub8x8_dct_dc_avx512)
void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_sse2 x264_template(sub8x16_dct_dc_sse2)
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
#define x264_sub8x16_dct_dc_ssse3 x264_template(sub8x16_dct_dc_ssse3)
void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x16_dct_dc_avx x264_template(sub8x16_dct_dc_avx)
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
#define x264_sub8x16_dct_dc_avx512 x264_template(sub8x16_dct_dc_avx512)
void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
#define x264_add4x4_idct_mmx x264_template(add4x4_idct_mmx)
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
#define x264_add4x4_idct_sse2 x264_template(add4x4_idct_sse2)
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
#define x264_add4x4_idct_sse4 x264_template(add4x4_idct_sse4)
void x264_add4x4_idct_sse4 ( uint8_t *p_dst, int16_t dct [16] );
#define x264_add4x4_idct_avx x264_template(add4x4_idct_avx)
void x264_add4x4_idct_avx ( pixel *p_dst, dctcoef dct [16] );
/* Prototypes for the add8x8 / add16x16 idct and idct_dc variants.
 * Each public name is routed through x264_template() by the #define on the
 * line preceding its prototype. Some variants are declared with the
 * project-wide pixel/dctcoef types, others with fixed uint8_t/int16_t.
 * NOTE(review): presumably the fixed-type variants are only valid for one
 * bit depth -- confirm against the callers. */
#define x264_add8x8_idct_mmx x264_template(add8x8_idct_mmx)
void x264_add8x8_idct_mmx ( uint8_t *p_dst, int16_t dct[ 4][16] );
#define x264_add8x8_idct_dc_mmx2 x264_template(add8x8_idct_dc_mmx2)
void x264_add8x8_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [ 4] );
#define x264_add16x16_idct_mmx x264_template(add16x16_idct_mmx)
void x264_add16x16_idct_mmx ( uint8_t *p_dst, int16_t dct[16][16] );
#define x264_add16x16_idct_dc_mmx2 x264_template(add16x16_idct_dc_mmx2)
void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
#define x264_add8x8_idct_sse2 x264_template(add8x8_idct_sse2)
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
#define x264_add8x8_idct_avx x264_template(add8x8_idct_avx)
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
#define x264_add8x8_idct_avx2 x264_template(add8x8_idct_avx2)
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
#define x264_add8x8_idct_avx512 x264_template(add8x8_idct_avx512)
void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] );
#define x264_add16x16_idct_sse2 x264_template(add16x16_idct_sse2)
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
#define x264_add16x16_idct_avx x264_template(add16x16_idct_avx)
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
#define x264_add16x16_idct_avx2 x264_template(add16x16_idct_avx2)
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
#define x264_add8x8_idct_dc_sse2 x264_template(add8x8_idct_dc_sse2)
void x264_add8x8_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [ 4] );
#define x264_add16x16_idct_dc_sse2 x264_template(add16x16_idct_dc_sse2)
void x264_add16x16_idct_dc_sse2 ( pixel *p_dst, dctcoef dct [16] );
#define x264_add8x8_idct_dc_ssse3 x264_template(add8x8_idct_dc_ssse3)
void x264_add8x8_idct_dc_ssse3 ( uint8_t *p_dst, int16_t dct [ 4] );
#define x264_add16x16_idct_dc_ssse3 x264_template(add16x16_idct_dc_ssse3)
void x264_add16x16_idct_dc_ssse3( uint8_t *p_dst, int16_t dct [16] );
#define x264_add8x8_idct_dc_avx x264_template(add8x8_idct_dc_avx)
void x264_add8x8_idct_dc_avx ( pixel *p_dst, dctcoef dct [ 4] );
#define x264_add16x16_idct_dc_avx x264_template(add16x16_idct_dc_avx)
void x264_add16x16_idct_dc_avx ( pixel *p_dst, dctcoef dct [16] );
#define x264_add16x16_idct_dc_avx2 x264_template(add16x16_idct_dc_avx2)
void x264_add16x16_idct_dc_avx2 ( uint8_t *p_dst, int16_t dct [16] );
/* In-place DC transforms on a 16-coefficient array (dct4x4dc / idct4x4dc) and
 * the dct2x4dc variants that take a separate 8-entry output plus the
 * dct4x4[8][16] block array. The mmx/mmx2 4x4dc prototypes use int16_t
 * coefficients, the sse2/avx ones int32_t; dct2x4dc uses dctcoef throughout. */
#define x264_dct4x4dc_mmx2 x264_template(dct4x4dc_mmx2)
void x264_dct4x4dc_mmx2 ( int16_t d[16] );
#define x264_dct4x4dc_sse2 x264_template(dct4x4dc_sse2)
void x264_dct4x4dc_sse2 ( int32_t d[16] );
#define x264_dct4x4dc_avx x264_template(dct4x4dc_avx)
void x264_dct4x4dc_avx ( int32_t d[16] );
#define x264_idct4x4dc_mmx x264_template(idct4x4dc_mmx)
void x264_idct4x4dc_mmx ( int16_t d[16] );
#define x264_idct4x4dc_sse2 x264_template(idct4x4dc_sse2)
void x264_idct4x4dc_sse2 ( int32_t d[16] );
#define x264_idct4x4dc_avx x264_template(idct4x4dc_avx)
void x264_idct4x4dc_avx ( int32_t d[16] );
#define x264_dct2x4dc_mmx2 x264_template(dct2x4dc_mmx2)
void x264_dct2x4dc_mmx2( dctcoef dct[8], dctcoef dct4x4[8][16] );
#define x264_dct2x4dc_sse2 x264_template(dct2x4dc_sse2)
void x264_dct2x4dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16] );
#define x264_dct2x4dc_avx x264_template(dct2x4dc_avx)
void x264_dct2x4dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16] );
/* 8x8-transform entry points: sub*_dct8 takes two pixel pointers and writes
 * 64-coefficient blocks (one for 8x8, four for 16x16); add*_idct8 takes a
 * destination pointer plus the same coefficient layout.
 * The mmx/ssse3 prototypes are fixed to uint8_t/int16_t and the sse4 ones to
 * uint16_t/int32_t, while sse2/avx/avx2 use pixel/dctcoef. */
#define x264_sub8x8_dct8_mmx x264_template(sub8x8_dct8_mmx)
void x264_sub8x8_dct8_mmx ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_mmx x264_template(sub16x16_dct8_mmx)
void x264_sub16x16_dct8_mmx ( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct8_sse2 x264_template(sub8x8_dct8_sse2)
void x264_sub8x8_dct8_sse2 ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
#define x264_sub16x16_dct8_sse2 x264_template(sub16x16_dct8_sse2)
void x264_sub16x16_dct8_sse2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
#define x264_sub8x8_dct8_ssse3 x264_template(sub8x8_dct8_ssse3)
void x264_sub8x8_dct8_ssse3 ( int16_t dct [64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub16x16_dct8_ssse3 x264_template(sub16x16_dct8_ssse3)
void x264_sub16x16_dct8_ssse3( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
#define x264_sub8x8_dct8_sse4 x264_template(sub8x8_dct8_sse4)
void x264_sub8x8_dct8_sse4 ( int32_t dct [64], uint16_t *pix1, uint16_t *pix2 );
#define x264_sub16x16_dct8_sse4 x264_template(sub16x16_dct8_sse4)
void x264_sub16x16_dct8_sse4 ( int32_t dct[4][64], uint16_t *pix1, uint16_t *pix2 );
#define x264_sub8x8_dct8_avx x264_template(sub8x8_dct8_avx)
void x264_sub8x8_dct8_avx ( dctcoef dct [64], pixel *pix1, pixel *pix2 );
#define x264_sub16x16_dct8_avx x264_template(sub16x16_dct8_avx)
void x264_sub16x16_dct8_avx ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
#define x264_sub16x16_dct8_avx2 x264_template(sub16x16_dct8_avx2)
void x264_sub16x16_dct8_avx2 ( dctcoef dct[4][64], pixel *pix1, pixel *pix2 );
#define x264_add8x8_idct8_mmx x264_template(add8x8_idct8_mmx)
void x264_add8x8_idct8_mmx ( uint8_t *dst, int16_t dct [64] );
#define x264_add16x16_idct8_mmx x264_template(add16x16_idct8_mmx)
void x264_add16x16_idct8_mmx ( uint8_t *dst, int16_t dct[4][64] );
#define x264_add8x8_idct8_sse2 x264_template(add8x8_idct8_sse2)
void x264_add8x8_idct8_sse2 ( pixel *dst, dctcoef dct [64] );
#define x264_add16x16_idct8_sse2 x264_template(add16x16_idct8_sse2)
void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
#define x264_add8x8_idct8_avx x264_template(add8x8_idct8_avx)
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
#define x264_add16x16_idct8_avx x264_template(add16x16_idct8_avx)
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
/* zigzag_scan prototypes: each takes an output `level` array and an input
 * `dct` array of the same length (16 entries for 4x4, 64 for 8x8), in
 * separate frame and field flavours. Element types differ per variant
 * (int16_t, int32_t or dctcoef), so callers must pick the variant matching
 * their coefficient type. */
#define x264_zigzag_scan_8x8_frame_mmx2 x264_template(zigzag_scan_8x8_frame_mmx2)
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_frame_sse2 x264_template(zigzag_scan_8x8_frame_sse2)
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
#define x264_zigzag_scan_8x8_frame_ssse3 x264_template(zigzag_scan_8x8_frame_ssse3)
void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_frame_avx x264_template(zigzag_scan_8x8_frame_avx)
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
#define x264_zigzag_scan_8x8_frame_xop x264_template(zigzag_scan_8x8_frame_xop)
void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_frame_avx512 x264_template(zigzag_scan_8x8_frame_avx512)
void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
#define x264_zigzag_scan_4x4_frame_mmx x264_template(zigzag_scan_4x4_frame_mmx)
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_4x4_frame_sse2 x264_template(zigzag_scan_4x4_frame_sse2)
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
#define x264_zigzag_scan_4x4_frame_ssse3 x264_template(zigzag_scan_4x4_frame_ssse3)
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_4x4_frame_avx x264_template(zigzag_scan_4x4_frame_avx)
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
#define x264_zigzag_scan_4x4_frame_xop x264_template(zigzag_scan_4x4_frame_xop)
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
#define x264_zigzag_scan_4x4_frame_avx512 x264_template(zigzag_scan_4x4_frame_avx512)
void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] );
#define x264_zigzag_scan_4x4_field_sse x264_template(zigzag_scan_4x4_field_sse)
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
#define x264_zigzag_scan_4x4_field_sse2 x264_template(zigzag_scan_4x4_field_sse2)
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
#define x264_zigzag_scan_4x4_field_avx512 x264_template(zigzag_scan_4x4_field_avx512)
void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] );
#define x264_zigzag_scan_8x8_field_mmx2 x264_template(zigzag_scan_8x8_field_mmx2)
void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_field_sse4 x264_template(zigzag_scan_8x8_field_sse4)
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
#define x264_zigzag_scan_8x8_field_avx x264_template(zigzag_scan_8x8_field_avx)
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
#define x264_zigzag_scan_8x8_field_xop x264_template(zigzag_scan_8x8_field_xop)
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
#define x264_zigzag_scan_8x8_field_avx512 x264_template(zigzag_scan_8x8_field_avx512)
void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] );
/* zigzag_sub prototypes (8-bit types only: int16_t levels, uint8_t pixels).
 * All return int; the 4x4ac variants take an extra int16_t *dc output.
 * `src` is const, `dst` is not.
 * NOTE(review): the meaning of the int return value is not visible in this
 * header -- presumably a "block has non-zero coefficients" flag; confirm. */
#define x264_zigzag_sub_4x4_frame_avx x264_template(zigzag_sub_4x4_frame_avx)
int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
#define x264_zigzag_sub_4x4_frame_ssse3 x264_template(zigzag_sub_4x4_frame_ssse3)
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
#define x264_zigzag_sub_4x4ac_frame_avx x264_template(zigzag_sub_4x4ac_frame_avx)
int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
#define x264_zigzag_sub_4x4ac_frame_ssse3 x264_template(zigzag_sub_4x4ac_frame_ssse3)
int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
#define x264_zigzag_sub_4x4_field_avx x264_template(zigzag_sub_4x4_field_avx)
int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
#define x264_zigzag_sub_4x4_field_ssse3 x264_template(zigzag_sub_4x4_field_ssse3)
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
#define x264_zigzag_sub_4x4ac_field_avx x264_template(zigzag_sub_4x4ac_field_avx)
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
#define x264_zigzag_sub_4x4ac_field_ssse3 x264_template(zigzag_sub_4x4ac_field_ssse3)
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
/* zigzag_interleave_8x8_cavlc prototypes: coefficient dst/src pointers plus a
 * uint8_t *nnz pointer. The mmx and avx2 variants are fixed to int16_t
 * coefficients; sse2/avx/avx512 use dctcoef. */
#define x264_zigzag_interleave_8x8_cavlc_mmx x264_template(zigzag_interleave_8x8_cavlc_mmx)
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
#define x264_zigzag_interleave_8x8_cavlc_sse2 x264_template(zigzag_interleave_8x8_cavlc_sse2)
void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#define x264_zigzag_interleave_8x8_cavlc_avx x264_template(zigzag_interleave_8x8_cavlc_avx)
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#define x264_zigzag_interleave_8x8_cavlc_avx2 x264_template(zigzag_interleave_8x8_cavlc_avx2)
void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz );
#define x264_zigzag_interleave_8x8_cavlc_avx512 x264_template(zigzag_interleave_8x8_cavlc_avx512)
void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif

2548
common/x86/deblock-a.asm Normal file

File diff suppressed because it is too large Load Diff

146
common/x86/deblock.h Normal file
View File

@@ -0,0 +1,146 @@
/*****************************************************************************
* deblock.h: x86 deblocking
*****************************************************************************
* Copyright (C) 2017-2025 x264 project
*
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_DEBLOCK_H
#define X264_X86_DEBLOCK_H
/* Luma and chroma deblocking filter prototypes (sse2/avx, plus mmx2 for the
 * chroma_422 variants). Every public name is routed through x264_template()
 * by the #define preceding its prototype.
 * The non-intra variants take a trailing int8_t *tc0 argument; the *_intra
 * variants take only pix, stride, alpha and beta. */
#define x264_deblock_v_luma_sse2 x264_template(deblock_v_luma_sse2)
void x264_deblock_v_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_luma_avx x264_template(deblock_v_luma_avx)
void x264_deblock_v_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_sse2 x264_template(deblock_h_luma_sse2)
void x264_deblock_h_luma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_avx x264_template(deblock_h_luma_avx)
void x264_deblock_h_luma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_sse2 x264_template(deblock_v_chroma_sse2)
void x264_deblock_v_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_avx x264_template(deblock_v_chroma_avx)
void x264_deblock_v_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_sse2 x264_template(deblock_h_chroma_sse2)
void x264_deblock_h_chroma_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_avx x264_template(deblock_h_chroma_avx)
void x264_deblock_h_chroma_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_sse2 x264_template(deblock_h_chroma_mbaff_sse2)
void x264_deblock_h_chroma_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_avx x264_template(deblock_h_chroma_mbaff_avx)
void x264_deblock_h_chroma_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_422_mmx2 x264_template(deblock_h_chroma_422_mmx2)
void x264_deblock_h_chroma_422_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_422_sse2 x264_template(deblock_h_chroma_422_sse2)
void x264_deblock_h_chroma_422_sse2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_422_avx x264_template(deblock_h_chroma_422_avx)
void x264_deblock_h_chroma_422_avx ( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_luma_intra_sse2 x264_template(deblock_v_luma_intra_sse2)
void x264_deblock_v_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_luma_intra_avx x264_template(deblock_v_luma_intra_avx)
void x264_deblock_v_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_sse2 x264_template(deblock_h_luma_intra_sse2)
void x264_deblock_h_luma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_luma_intra_avx x264_template(deblock_h_luma_intra_avx)
void x264_deblock_h_luma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_sse2 x264_template(deblock_v_chroma_intra_sse2)
void x264_deblock_v_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_avx x264_template(deblock_v_chroma_intra_avx)
void x264_deblock_v_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_sse2 x264_template(deblock_h_chroma_intra_sse2)
void x264_deblock_h_chroma_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_avx x264_template(deblock_h_chroma_intra_avx)
void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_mmx2 x264_template(deblock_h_chroma_422_intra_mmx2)
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_sse2 x264_template(deblock_h_chroma_422_intra_sse2)
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_422_intra_avx x264_template(deblock_h_chroma_422_intra_avx)
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
/* deblock_strength prototypes. All five SIMD variants share one signature:
 *   nnz       - X264_SCAN8_SIZE bytes
 *   ref       - int8_t  [2][X264_SCAN8_LUMA_SIZE]
 *   mv        - int16_t [2][X264_SCAN8_LUMA_SIZE][2]
 *   bs        - uint8_t [2][8][4]
 *               (NOTE(review): presumably the output strength table -- confirm)
 *   mvy_limit, bframe - scalar ints */
#define x264_deblock_strength_sse2 x264_template(deblock_strength_sse2)
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_strength_ssse3 x264_template(deblock_strength_ssse3)
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_strength_avx x264_template(deblock_strength_avx)
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_strength_avx2 x264_template(deblock_strength_avx2)
void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#define x264_deblock_strength_avx512 x264_template(deblock_strength_avx512)
void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
/* h_chroma_intra_mbaff prototypes (mmx2/sse2/avx). These are declared outside
 * the ARCH_X86 section, i.e. independently of that switch. */
#define x264_deblock_h_chroma_intra_mbaff_mmx2 x264_template(deblock_h_chroma_intra_mbaff_mmx2)
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_mbaff_sse2 x264_template(deblock_h_chroma_intra_mbaff_sse2)
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_mbaff_avx x264_template(deblock_h_chroma_intra_mbaff_avx)
void x264_deblock_h_chroma_intra_mbaff_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
#if ARCH_X86
/* mmx2 filter variants; these sit inside the enclosing `#if ARCH_X86` and are
 * therefore only declared when that macro is non-zero.
 * Note that the two v8_luma entry points take uint8_t * rather than pixel *. */
#define x264_deblock_h_luma_mmx2 x264_template(deblock_h_luma_mmx2)
void x264_deblock_h_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v8_luma_mmx2 x264_template(deblock_v8_luma_mmx2)
void x264_deblock_v8_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_v_chroma_mmx2 x264_template(deblock_v_chroma_mmx2)
void x264_deblock_v_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mmx2 x264_template(deblock_h_chroma_mmx2)
void x264_deblock_h_chroma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_chroma_mbaff_mmx2 x264_template(deblock_h_chroma_mbaff_mmx2)
void x264_deblock_h_chroma_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#define x264_deblock_h_luma_intra_mmx2 x264_template(deblock_h_luma_intra_mmx2)
void x264_deblock_h_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v8_luma_intra_mmx2 x264_template(deblock_v8_luma_intra_mmx2)
void x264_deblock_v8_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_v_chroma_intra_mmx2 x264_template(deblock_v_chroma_intra_mmx2)
void x264_deblock_v_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#define x264_deblock_h_chroma_intra_mmx2 x264_template(deblock_h_chroma_intra_mmx2)
void x264_deblock_h_chroma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
/* The template macro must name the same symbol as the prototype it precedes.
 * Only x264_deblock_h_chroma_intra_mbaff_mmx2 is declared here, so that is the
 * name routed through x264_template(). The same #define already appears
 * earlier in this header with an identical replacement list, so repeating it
 * is a benign redefinition. */
#define x264_deblock_h_chroma_intra_mbaff_mmx2 x264_template(deblock_h_chroma_intra_mbaff_mmx2)
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
/* x264_deblock_v_luma_mmx2 / x264_deblock_v_luma_intra_mmx2:
 * the names are always routed through x264_template(); with HIGH_BIT_DEPTH
 * they are ordinary external functions, otherwise the #else branch supplies
 * them as static inline wrappers. */
#define x264_deblock_v_luma_mmx2 x264_template(deblock_v_luma_mmx2)
#define x264_deblock_v_luma_intra_mmx2 x264_template(deblock_v_luma_intra_mmx2)
#if HIGH_BIT_DEPTH
void x264_deblock_v_luma_mmx2( pixel *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
#else
// FIXME this wrapper has a significant cpu cost
/* Runs the v8 luma filter twice: first on pix with tc0, then on pix+8 with
 * tc0+2. All other arguments are forwarded unchanged. */
static ALWAYS_INLINE void x264_deblock_v_luma_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
{
    uint8_t *second_half = pix + 8;
    int8_t *second_tc0 = tc0 + 2;

    x264_deblock_v8_luma_mmx2( pix, stride, alpha, beta, tc0 );
    x264_deblock_v8_luma_mmx2( second_half, stride, alpha, beta, second_tc0 );
}
/* Runs the v8 intra luma filter twice: on pix, then on pix+8, with the same
 * stride, alpha and beta. */
static ALWAYS_INLINE void x264_deblock_v_luma_intra_mmx2( uint8_t *pix, intptr_t stride, int alpha, int beta )
{
    uint8_t *second_half = pix + 8;

    x264_deblock_v8_luma_intra_mmx2( pix, stride, alpha, beta );
    x264_deblock_v8_luma_intra_mmx2( second_half, stride, alpha, beta );
}
#endif // HIGH_BIT_DEPTH
#endif
#endif

2226
common/x86/mc-a.asm Normal file

File diff suppressed because it is too large Load Diff

2883
common/x86/mc-a2.asm Normal file

File diff suppressed because it is too large Load Diff

1143
common/x86/mc-c.c Normal file

File diff suppressed because it is too large Load Diff

33
common/x86/mc.h Normal file
View File

@@ -0,0 +1,33 @@
/*****************************************************************************
* mc.h: x86 motion compensation
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_MC_H
#define X264_X86_MC_H
/* x86 motion-compensation init hook: takes a 32-bit cpu word and a pointer to
 * an x264_mc_functions_t.
 * NOTE(review): presumably fills *pf with the implementations selected by the
 * cpu flags -- the definition is not visible here; confirm in mc-c.c. */
#define x264_mc_init_mmx x264_template(mc_init_mmx)
void x264_mc_init_mmx( uint32_t cpu, x264_mc_functions_t *pf );
#endif

423
common/x86/pixel-32.asm Normal file
View File

@@ -0,0 +1,423 @@
;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2025 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
cextern pw_ppmmppmm
cextern pw_pmpmpmpm
SECTION .text
INIT_MMX mmx2
%if HIGH_BIT_DEPTH == 0
; LOAD_DIFF_4x8P dx
; Fills m0..m7 with eight rows produced by LOAD_DIFF (macro defined outside
; this file) from [r0+dx+...] and [r2+dx+...].
;   r0, r2 : row base pointers; both are advanced by four rows (4*r1 / 4*r3)
;   r1, r3 : row strides for r0 and r2 respectively
;   r4, r5 : offset of the fourth row in each group of four
;            (presumably 3*r1 and 3*r3, set up by the caller -- TODO confirm)
; Uses [spill] as an 8-byte scratch slot, so `spill` must be %defined by the
; code that expands this macro.
%macro LOAD_DIFF_4x8P 1 ; dx
; rows 0-3: m7 and m6 alternate as the LOAD_DIFF temporary
LOAD_DIFF m0, m7, none, [r0+%1], [r2+%1]
LOAD_DIFF m1, m6, none, [r0+%1+r1], [r2+%1+r3]
LOAD_DIFF m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
LOAD_DIFF m3, m6, none, [r0+%1+r4], [r2+%1+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
; rows 4-6
LOAD_DIFF m4, m7, none, [r0+%1], [r2+%1]
LOAD_DIFF m5, m6, none, [r0+%1+r1], [r2+%1+r3]
LOAD_DIFF m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
; row 7 lands in m7 and every other register is live, so m5 is parked in
; [spill] while it serves as the temporary, then restored
movq [spill], m5
LOAD_DIFF m7, m5, none, [r0+%1+r4], [r2+%1+r5]
movq m5, [spill]
%endmacro
; SUM4x8_MM
; Applies ABSW2 (macro defined outside this file; presumably packed-word
; absolute value of a register pair) to m0..m7 and adds the eight results
; together with paddw, leaving the per-lane sum in m0.
; Modifies m0..m7 and the 16 bytes at [spill].
%macro SUM4x8_MM 0
; m6/m7 are needed as ABSW2 scratch first, so park their data
movq [spill], m6
movq [spill+8], m7
ABSW2 m0, m1, m0, m1, m6, m7
ABSW2 m2, m3, m2, m3, m6, m7
paddw m0, m2
paddw m1, m3
; m2/m3 have been folded into m0/m1 and become the scratch pair
movq m6, [spill]
movq m7, [spill+8]
ABSW2 m4, m5, m4, m5, m2, m3
ABSW2 m6, m7, m6, m7, m2, m3
paddw m4, m6
paddw m5, m7
paddw m0, m4
paddw m1, m5
paddw m0, m1
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
; Internal helper: declared with a bare cglobal (no argument/register counts)
; and left with a plain `ret`. No integer register is written with a result;
; the last value computed is the pavgw into m0.
; Register inputs are those consumed by LOAD_DIFF_4x8P: r0/r2 row pointers,
; r1/r3 strides, r4/r5 fourth-row offsets.
;
; Stack layout after the two pushes and the sub (offsets from esp):
;   0x00..0x5f  trans  - parked 4x4 word tiles
;   0x60..0x6f  spill  - scratch for the macros
;   0x74        saved r2, 0x78 saved r0 (reloaded for the second column half)
cglobal pixel_sa8d_8x8_internal
push r0
push r2
sub esp, 0x74
%define args esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
; columns 0-3 of all eight rows
LOAD_DIFF_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; m1 is used as the transpose temporary, so save it around the first transpose
movq [spill], m1
TRANSPOSE4x4W 4, 5, 6, 7, 1
movq [trans+0x00], m4
movq [trans+0x08], m5
movq [trans+0x10], m6
movq [trans+0x18], m7
movq m1, [spill]
TRANSPOSE4x4W 0, 1, 2, 3, 4
movq [trans+0x20], m0
movq [trans+0x28], m1
movq [trans+0x30], m2
movq [trans+0x38], m3
; LOAD_DIFF_4x8P advanced r0/r2, so restore the original pointers from the stack
mov r0, [args+4]
mov r2, [args]
; columns 4-7 of all eight rows
LOAD_DIFF_4x8P 4
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq [trans+0x40], m0
movq [trans+0x48], m1
movq [trans+0x50], m2
movq [trans+0x58], m3
movq m7, [spill]
TRANSPOSE4x4W 4, 5, 6, 7, 1
; second pass, first half: tile parked at trans+0x00 together with the tile
; still in m4..m7
movq m0, [trans+0x00]
movq m1, [trans+0x08]
movq m2, [trans+0x10]
movq m3, [trans+0x18]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
; trans+0x00 has been consumed, so reuse it to hold the first partial sum
movq [trans], m0
; second pass, second half: the remaining two parked tiles
movq m0, [trans+0x20]
movq m1, [trans+0x28]
movq m2, [trans+0x30]
movq m3, [trans+0x38]
movq m4, [trans+0x40]
movq m5, [trans+0x48]
movq m6, [trans+0x50]
movq m7, [trans+0x58]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
SUM4x8_MM
; combine the two partial sums with a rounded average
pavgw m0, [trans]
; 0x7c = 0x74 bytes of locals + the 8 bytes pushed for r0 and r2
add esp, 0x7c
ret
%undef args
%undef spill
%undef trans
; SUM_MM_X3 sum0, sum1, sum2, tmp0, tmp1, tmp2, zero, op
; Horizontally reduces three word-lane accumulators (%1..%3) in parallel.
;   %4..%6 : temporaries, clobbered
;   %7     : cleared to zero and used as the unpack source
;   %8     : instruction applied in the final combining step
; q1032 is a shuffle constant from x86inc.asm (presumably swapping the two
; 32-bit halves of the register -- confirm there).
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
pxor %7, %7
pshufw %4, %1, q1032
pshufw %5, %2, q1032
pshufw %6, %3, q1032
; first fold uses unsigned saturating word adds
paddusw %1, %4
paddusw %2, %5
paddusw %3, %6
; widen the low two words to dwords by interleaving with zero
punpcklwd %1, %7
punpcklwd %2, %7
punpcklwd %3, %7
pshufw %4, %1, q1032
pshufw %5, %2, q1032
pshufw %6, %3, q1032
; second fold uses the caller-selected instruction
%8 %1, %4
%8 %2, %5
%8 %3, %6
%endmacro
; LOAD_4x8P dx
; Loads 4 bytes from each of eight rows at r0+dx (row pitch FENC_STRIDE) and
; zero-extends them to words: row N ends up in mN for N = 0..7.
; r0 is not modified. Uses [spill] as an 8-byte scratch slot.
%macro LOAD_4x8P 1 ; dx
; m7 is the zero source for the unpacks, so row 7 is loaded into m6 first,
; widened, parked in [spill] and only moved into m7 at the very end
pxor m7, m7
movd m6, [r0+%1+7*FENC_STRIDE]
movd m0, [r0+%1+0*FENC_STRIDE]
movd m1, [r0+%1+1*FENC_STRIDE]
movd m2, [r0+%1+2*FENC_STRIDE]
movd m3, [r0+%1+3*FENC_STRIDE]
movd m4, [r0+%1+4*FENC_STRIDE]
movd m5, [r0+%1+5*FENC_STRIDE]
punpcklbw m6, m7
punpcklbw m0, m7
punpcklbw m1, m7
movq [spill], m6
punpcklbw m2, m7
punpcklbw m3, m7
; m6 is free again: load row 6 into it
movd m6, [r0+%1+6*FENC_STRIDE]
punpcklbw m4, m7
punpcklbw m5, m7
punpcklbw m6, m7
movq m7, [spill]
%endmacro
; HSUMSUB2 a, b, shuf, mul
;   %1, %2 : data registers, updated in place
;   %3     : pshufw shuffle constant
;   %4     : register of word multipliers
; Clobbers m4 and m5, so %1/%2/%4 must not be m4 or m5.
; Result as written:
;   %1 = %1 * %4 + shuffle(%1)
;   %2 = %2 + shuffle(%2) * %4
; NOTE(review): the multiplier is applied to the unshuffled value for %1 but
; to the shuffled copy for %2 -- presumably intentional; confirm.
%macro HSUMSUB2 4
pshufw m4, %1, %3
pshufw m5, %2, %3
pmullw %1, %4
pmullw m5, %4
paddw %1, m4
paddw %2, m5
%endmacro
;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
; r0 = fenc (read through LOAD_4x8P with pitch FENC_STRIDE), r1 = edge.
; r2 is used as an integer scratch register for the dc term and is reloaded
; from the stack argument (r2m) before the results are stored.
; Results written: [res+0] and [res+4] with one movq ("v, h"), [res+8] with a
; movd ("dc").
;
; Stack layout (offsets from esp after the SUB):
;   0x00..0x5f  trans - parked 4x4 word tiles
;   0x00..0x1f  sum   - overlays the start of trans; each part is written only
;                       after the trans tile it overlaps has been loaded
;   0x60..0x6f  spill - scratch for the macros
;   0x70..0x8f  edge  - transformed edge words (4 x 8 bytes)
cglobal intra_sa8d_x3_8x8, 2,3
SUB esp, 0x94
%define edge esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0 ; +96
%define sum esp+0 ; +32
; Load 8 bytes from edge[7..14] and 8 bytes from edge[16..23] and widen each
; to two registers of four words (m0/m1 and m2/m3).
pxor m7, m7
movq m0, [r1+7]
movq m2, [r1+16]
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
; Two HSUMSUB2 stages with the sign-pattern constants, then a final
; sum/difference across the register halves (presumably a 1-D Hadamard
; transform of each edge vector -- confirm against the C reference).
movq m6, [pw_ppmmppmm]
HSUMSUB2 m0, m2, q1032, m6
HSUMSUB2 m1, m3, q1032, m6
movq m6, [pw_pmpmpmpm]
HSUMSUB2 m0, m2, q2301, m6
HSUMSUB2 m1, m3, q2301, m6
movq m4, m0
movq m5, m2
paddw m0, m1
paddw m2, m3
psubw m4, m1
psubw m3, m5
movq [edge+0], m0
movq [edge+8], m4
movq [edge+16], m2
movq [edge+24], m3
; Columns 0-3 of the source block: vertical pass, transpose, park in trans.
LOAD_4x8P 0
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; m0 is the transpose temporary, so save it around the first transpose
movq [spill], m0
TRANSPOSE4x4W 4, 5, 6, 7, 0
movq [trans+0x00], m4
movq [trans+0x08], m5
movq [trans+0x10], m6
movq [trans+0x18], m7
movq m0, [spill]
TRANSPOSE4x4W 0, 1, 2, 3, 4
movq [trans+0x20], m0
movq [trans+0x28], m1
movq [trans+0x30], m2
movq [trans+0x38], m3
; Columns 4-7 (LOAD_4x8P does not modify r0, so no pointer reload is needed).
LOAD_4x8P 4
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
movq [spill], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
movq [trans+0x40], m0
movq [trans+0x48], m1
movq [trans+0x50], m2
movq [trans+0x58], m3
movq m7, [spill]
TRANSPOSE4x4W 4, 5, 6, 7, 0
; Second pass, first half: tile parked at trans+0x00 plus the tile in m4..m7.
movq m0, [trans+0x00]
movq m1, [trans+0x08]
movq m2, [trans+0x10]
movq m3, [trans+0x18]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; m0 and m1 are kept out of the common sum: park them while they serve as
; ABSW2 scratch, then restore them
movq [spill+0], m0
movq [spill+8], m1
ABSW2 m2, m3, m2, m3, m0, m1
ABSW2 m4, m5, m4, m5, m0, m1
paddw m2, m4
paddw m3, m5
ABSW2 m6, m7, m6, m7, m4, m5
movq m0, [spill+0]
movq m1, [spill+8]
paddw m2, m6
paddw m3, m7
paddw m2, m3
ABSW m1, m1, m4
paddw m2, m1 ; 7x4 sum
; m0 gets two treatments: as is, and with 8 * edge[8..] subtracted
movq m7, m0
movq m1, [edge+8] ; left bottom
psllw m1, 3
psubw m7, m1
ABSW2 m0, m7, m0, m7, m5, m3
paddw m0, m2
paddw m7, m2
; trans+0x00..0x0f were consumed above, so the overlay is safe to write
movq [sum+0], m0 ; dc
movq [sum+8], m7 ; left
; Second pass, second half: the remaining two parked tiles.
movq m0, [trans+0x20]
movq m1, [trans+0x28]
movq m2, [trans+0x30]
movq m3, [trans+0x38]
movq m4, [trans+0x40]
movq m5, [trans+0x48]
movq m6, [trans+0x50]
movq m7, [trans+0x58]
HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
; Each movd stores 32 bits but the offsets step by 2 bytes, so every store
; overwrites the upper half of the previous one: the low word of m0..m7 ends
; up packed at sum+0x10..0x1f (the last store also touches sum+0x20..0x21,
; which lies in trans data that has already been loaded).
movd [sum+0x10], m0
movd [sum+0x12], m1
movd [sum+0x14], m2
movd [sum+0x16], m3
movd [sum+0x18], m4
movd [sum+0x1a], m5
movd [sum+0x1c], m6
movd [sum+0x1e], m7
movq [spill], m0
movq [spill+8], m1
ABSW2 m2, m3, m2, m3, m0, m1
ABSW2 m4, m5, m4, m5, m0, m1
paddw m2, m4
paddw m3, m5
paddw m2, m3
movq m0, [spill]
movq m1, [spill+8]
ABSW2 m6, m7, m6, m7, m4, m5
ABSW m1, m1, m3
paddw m2, m7
paddw m1, m6
paddw m2, m1 ; 7x4 sum
movq m1, m0
movq m7, [edge+0]
psllw m7, 3 ; left top
; dc term in an integer register: (edge[0] + edge[16]) * 4 + 32, masked with
; 0xffc0, then moved into the low lane of m6
mov r2, [edge+0]
add r2, [edge+16]
lea r2, [4*r2+32]
and r2, 0xffc0
movd m6, r2 ; dc
psubw m1, m7
psubw m0, m6
ABSW2 m0, m1, m0, m1, m5, m6
movq m3, [sum+0] ; dc
paddw m0, m2
paddw m1, m2
movq m2, m0
paddw m0, m3
paddw m1, [sum+8] ; h
psrlq m2, 16
paddw m2, m3
movq m3, [edge+16] ; top left
movq m4, [edge+24] ; top right
psllw m3, 3
psllw m4, 3
; subtract the eight packed low words saved at sum+0x10..0x1f
psubw m3, [sum+16]
psubw m4, [sum+24]
ABSW2 m3, m4, m3, m4, m5, m6
paddw m2, m3
paddw m2, m4 ; v
; reduce the three accumulators (m0 = dc, m1 = h, m2 = v) horizontally
SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
; r2 was used as scratch above; reload the res pointer from the stack argument
mov r2, r2m
pxor m7, m7
; pack v (low dword) and h (high dword) into m2 for a single 8-byte store
punpckldq m2, m1
pavgw m0, m7
pavgw m2, m7
movd [r2+8], m0 ; dc
movq [r2+0], m2 ; v, h
ADD esp, 0x94
RET
%undef edge
%undef spill
%undef trans
%undef sum
;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
; const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
; Declared with 0 auto-loaded arguments and 5 registers, so every argument is
; fetched explicitly from its stack slot (r0m..r4m).
; The loop body runs twice, first with r4 = 4 and then with r4 = 0: r4 is both
; the byte offset added to pix1/pix2 and, scaled by 4, the byte offset of the
; 16-byte output row inside sums.
; Per 4x4 block the accumulators are:
;   m1 = word sums of pix1        m2 = word sums of pix2
;   m3 = pix1*pix1 + pix2*pix2    m4 = pix1*pix2   (dword pair sums via pmaddwd)
; Executes emms before returning.
cglobal pixel_ssim_4x4x2_core, 0,5
mov r1, r1m
mov r3, r3m
mov r4, 4
pxor m0, m0
.loop:
; restart from the original pointers each iteration, offset by r4 bytes
mov r0, r0m
mov r2, r2m
add r0, r4
add r2, r4
pxor m1, m1
pxor m2, m2
pxor m3, m3
pxor m4, m4
; one row of 4 pixels from each source per repetition
%rep 4
movd m5, [r0]
movd m6, [r2]
punpcklbw m5, m0
punpcklbw m6, m0
paddw m1, m5
paddw m2, m6
; m7 keeps a copy of the pix1 row for the cross product
movq m7, m5
pmaddwd m5, m5
pmaddwd m7, m6
pmaddwd m6, m6
paddd m3, m5
paddd m4, m7
paddd m3, m6
add r0, r1
add r2, r3
%endrep
; r0 = &sums[0][0] + r4*4 bytes, i.e. the second output row first, then the first
mov r0, r4m
lea r0, [r0+r4*4]
; horizontal reduction; q0032 is an x86inc.asm shuffle constant (presumably
; bringing the high dword down to the low dword -- confirm there)
pshufw m5, m1, q0032
pshufw m6, m2, q0032
paddusw m1, m5
paddusw m2, m6
punpcklwd m1, m2
pshufw m2, m1, q0032
pshufw m5, m3, q0032
pshufw m6, m4, q0032
paddusw m1, m2
paddd m3, m5
paddd m4, m6
; widen the two word totals in m1 to dwords; pair m3's total with m4's
punpcklwd m1, m0
punpckldq m3, m4
movq [r0+0], m1
movq [r0+8], m3
sub r4, 4
jge .loop
emms
RET
%endif ; !HIGH_BIT_DEPTH

5851
common/x86/pixel-a.asm Normal file

File diff suppressed because it is too large Load Diff

623
common/x86/pixel.h Normal file
View File

@@ -0,0 +1,623 @@
/*****************************************************************************
* pixel.h: x86 pixel metrics
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_PIXEL_H
#define X264_X86_PIXEL_H
/* Symbol aliases for the pixel_ads1 / pixel_ads2 / pixel_ads4 kernels.
 * Every public x264_pixel_ads* name is rewritten through x264_template(),
 * which is defined outside this header (presumably it applies a per-build
 * symbol prefix -- TODO confirm). Variants listed per kernel:
 * avx, avx2, mmx2, sse2, ssse3. */
#define x264_pixel_ads1_avx x264_template(pixel_ads1_avx)
#define x264_pixel_ads1_avx2 x264_template(pixel_ads1_avx2)
#define x264_pixel_ads1_mmx2 x264_template(pixel_ads1_mmx2)
#define x264_pixel_ads1_sse2 x264_template(pixel_ads1_sse2)
#define x264_pixel_ads1_ssse3 x264_template(pixel_ads1_ssse3)
#define x264_pixel_ads2_avx x264_template(pixel_ads2_avx)
#define x264_pixel_ads2_avx2 x264_template(pixel_ads2_avx2)
#define x264_pixel_ads2_mmx2 x264_template(pixel_ads2_mmx2)
#define x264_pixel_ads2_sse2 x264_template(pixel_ads2_sse2)
#define x264_pixel_ads2_ssse3 x264_template(pixel_ads2_ssse3)
#define x264_pixel_ads4_avx x264_template(pixel_ads4_avx)
#define x264_pixel_ads4_avx2 x264_template(pixel_ads4_avx2)
#define x264_pixel_ads4_mmx2 x264_template(pixel_ads4_mmx2)
#define x264_pixel_ads4_sse2 x264_template(pixel_ads4_sse2)
#define x264_pixel_ads4_ssse3 x264_template(pixel_ads4_ssse3)
/* Symbol aliases for the pixel_hadamard_ac kernels, routed through
 * x264_template() (defined outside this header).
 * Block sizes aliased here: 16x16, 16x8, 8x16, 8x8.
 * Note: the avx2 variant is aliased only for 16x16 and 16x8. */
#define x264_pixel_hadamard_ac_16x16_avx x264_template(pixel_hadamard_ac_16x16_avx)
#define x264_pixel_hadamard_ac_16x16_avx2 x264_template(pixel_hadamard_ac_16x16_avx2)
#define x264_pixel_hadamard_ac_16x16_mmx2 x264_template(pixel_hadamard_ac_16x16_mmx2)
#define x264_pixel_hadamard_ac_16x16_sse2 x264_template(pixel_hadamard_ac_16x16_sse2)
#define x264_pixel_hadamard_ac_16x16_sse4 x264_template(pixel_hadamard_ac_16x16_sse4)
#define x264_pixel_hadamard_ac_16x16_ssse3 x264_template(pixel_hadamard_ac_16x16_ssse3)
#define x264_pixel_hadamard_ac_16x16_ssse3_atom x264_template(pixel_hadamard_ac_16x16_ssse3_atom)
#define x264_pixel_hadamard_ac_16x16_xop x264_template(pixel_hadamard_ac_16x16_xop)
#define x264_pixel_hadamard_ac_16x8_avx x264_template(pixel_hadamard_ac_16x8_avx)
#define x264_pixel_hadamard_ac_16x8_avx2 x264_template(pixel_hadamard_ac_16x8_avx2)
#define x264_pixel_hadamard_ac_16x8_mmx2 x264_template(pixel_hadamard_ac_16x8_mmx2)
#define x264_pixel_hadamard_ac_16x8_sse2 x264_template(pixel_hadamard_ac_16x8_sse2)
#define x264_pixel_hadamard_ac_16x8_sse4 x264_template(pixel_hadamard_ac_16x8_sse4)
#define x264_pixel_hadamard_ac_16x8_ssse3 x264_template(pixel_hadamard_ac_16x8_ssse3)
#define x264_pixel_hadamard_ac_16x8_ssse3_atom x264_template(pixel_hadamard_ac_16x8_ssse3_atom)
#define x264_pixel_hadamard_ac_16x8_xop x264_template(pixel_hadamard_ac_16x8_xop)
#define x264_pixel_hadamard_ac_8x16_avx x264_template(pixel_hadamard_ac_8x16_avx)
#define x264_pixel_hadamard_ac_8x16_mmx2 x264_template(pixel_hadamard_ac_8x16_mmx2)
#define x264_pixel_hadamard_ac_8x16_sse2 x264_template(pixel_hadamard_ac_8x16_sse2)
#define x264_pixel_hadamard_ac_8x16_sse4 x264_template(pixel_hadamard_ac_8x16_sse4)
#define x264_pixel_hadamard_ac_8x16_ssse3 x264_template(pixel_hadamard_ac_8x16_ssse3)
#define x264_pixel_hadamard_ac_8x16_ssse3_atom x264_template(pixel_hadamard_ac_8x16_ssse3_atom)
#define x264_pixel_hadamard_ac_8x16_xop x264_template(pixel_hadamard_ac_8x16_xop)
#define x264_pixel_hadamard_ac_8x8_avx x264_template(pixel_hadamard_ac_8x8_avx)
#define x264_pixel_hadamard_ac_8x8_mmx2 x264_template(pixel_hadamard_ac_8x8_mmx2)
#define x264_pixel_hadamard_ac_8x8_sse2 x264_template(pixel_hadamard_ac_8x8_sse2)
#define x264_pixel_hadamard_ac_8x8_sse4 x264_template(pixel_hadamard_ac_8x8_sse4)
#define x264_pixel_hadamard_ac_8x8_ssse3 x264_template(pixel_hadamard_ac_8x8_ssse3)
#define x264_pixel_hadamard_ac_8x8_ssse3_atom x264_template(pixel_hadamard_ac_8x8_ssse3_atom)
#define x264_pixel_hadamard_ac_8x8_xop x264_template(pixel_hadamard_ac_8x8_xop)
/* Symbol aliases for the pixel_sa8d kernels, routed through x264_template()
 * (defined outside this header).
 * Only the 16x16 and 8x8 block sizes are aliased; avx2 and avx512 aliases
 * exist for 8x8 only. */
#define x264_pixel_sa8d_16x16_mmx2 x264_template(pixel_sa8d_16x16_mmx2)
#define x264_pixel_sa8d_16x16_avx x264_template(pixel_sa8d_16x16_avx)
#define x264_pixel_sa8d_16x16_sse2 x264_template(pixel_sa8d_16x16_sse2)
#define x264_pixel_sa8d_16x16_sse4 x264_template(pixel_sa8d_16x16_sse4)
#define x264_pixel_sa8d_16x16_ssse3 x264_template(pixel_sa8d_16x16_ssse3)
#define x264_pixel_sa8d_16x16_ssse3_atom x264_template(pixel_sa8d_16x16_ssse3_atom)
#define x264_pixel_sa8d_16x16_xop x264_template(pixel_sa8d_16x16_xop)
#define x264_pixel_sa8d_8x8_mmx2 x264_template(pixel_sa8d_8x8_mmx2)
#define x264_pixel_sa8d_8x8_avx x264_template(pixel_sa8d_8x8_avx)
#define x264_pixel_sa8d_8x8_avx2 x264_template(pixel_sa8d_8x8_avx2)
#define x264_pixel_sa8d_8x8_avx512 x264_template(pixel_sa8d_8x8_avx512)
#define x264_pixel_sa8d_8x8_sse2 x264_template(pixel_sa8d_8x8_sse2)
#define x264_pixel_sa8d_8x8_sse4 x264_template(pixel_sa8d_8x8_sse4)
#define x264_pixel_sa8d_8x8_ssse3 x264_template(pixel_sa8d_8x8_ssse3)
#define x264_pixel_sa8d_8x8_ssse3_atom x264_template(pixel_sa8d_8x8_ssse3_atom)
#define x264_pixel_sa8d_8x8_xop x264_template(pixel_sa8d_8x8_xop)
/* Symbol aliases for the single-reference pixel_sad kernels, routed through
 * x264_template() (defined outside this header).
 * Covers block sizes 16x16, 16x8, 4x16, 4x4, 4x8, 8x16, 8x4 and 8x8.
 * Besides the plain ISA suffixes there are cache32_* / cache64_* and
 * *_aligned variants; which variants exist differs per block size, so the
 * list below is not a full cross product. */
#define x264_pixel_sad_16x16_avx2 x264_template(pixel_sad_16x16_avx2)
#define x264_pixel_sad_16x16_avx512 x264_template(pixel_sad_16x16_avx512)
#define x264_pixel_sad_16x16_cache32_mmx2 x264_template(pixel_sad_16x16_cache32_mmx2)
#define x264_pixel_sad_16x16_cache64_mmx2 x264_template(pixel_sad_16x16_cache64_mmx2)
#define x264_pixel_sad_16x16_cache64_sse2 x264_template(pixel_sad_16x16_cache64_sse2)
#define x264_pixel_sad_16x16_cache64_ssse3 x264_template(pixel_sad_16x16_cache64_ssse3)
#define x264_pixel_sad_16x16_mmx2 x264_template(pixel_sad_16x16_mmx2)
#define x264_pixel_sad_16x16_sse2 x264_template(pixel_sad_16x16_sse2)
#define x264_pixel_sad_16x16_sse2_aligned x264_template(pixel_sad_16x16_sse2_aligned)
#define x264_pixel_sad_16x16_sse3 x264_template(pixel_sad_16x16_sse3)
#define x264_pixel_sad_16x16_ssse3 x264_template(pixel_sad_16x16_ssse3)
#define x264_pixel_sad_16x16_ssse3_aligned x264_template(pixel_sad_16x16_ssse3_aligned)
#define x264_pixel_sad_16x8_avx2 x264_template(pixel_sad_16x8_avx2)
#define x264_pixel_sad_16x8_avx512 x264_template(pixel_sad_16x8_avx512)
#define x264_pixel_sad_16x8_cache32_mmx2 x264_template(pixel_sad_16x8_cache32_mmx2)
#define x264_pixel_sad_16x8_cache64_mmx2 x264_template(pixel_sad_16x8_cache64_mmx2)
#define x264_pixel_sad_16x8_cache64_sse2 x264_template(pixel_sad_16x8_cache64_sse2)
#define x264_pixel_sad_16x8_cache64_ssse3 x264_template(pixel_sad_16x8_cache64_ssse3)
#define x264_pixel_sad_16x8_mmx2 x264_template(pixel_sad_16x8_mmx2)
#define x264_pixel_sad_16x8_sse2 x264_template(pixel_sad_16x8_sse2)
#define x264_pixel_sad_16x8_sse2_aligned x264_template(pixel_sad_16x8_sse2_aligned)
#define x264_pixel_sad_16x8_sse3 x264_template(pixel_sad_16x8_sse3)
#define x264_pixel_sad_16x8_ssse3 x264_template(pixel_sad_16x8_ssse3)
#define x264_pixel_sad_16x8_ssse3_aligned x264_template(pixel_sad_16x8_ssse3_aligned)
#define x264_pixel_sad_4x16_avx512 x264_template(pixel_sad_4x16_avx512)
#define x264_pixel_sad_4x16_mmx2 x264_template(pixel_sad_4x16_mmx2)
#define x264_pixel_sad_4x4_avx512 x264_template(pixel_sad_4x4_avx512)
#define x264_pixel_sad_4x4_mmx2 x264_template(pixel_sad_4x4_mmx2)
#define x264_pixel_sad_4x4_ssse3 x264_template(pixel_sad_4x4_ssse3)
#define x264_pixel_sad_4x8_avx512 x264_template(pixel_sad_4x8_avx512)
#define x264_pixel_sad_4x8_mmx2 x264_template(pixel_sad_4x8_mmx2)
#define x264_pixel_sad_4x8_ssse3 x264_template(pixel_sad_4x8_ssse3)
#define x264_pixel_sad_8x16_avx512 x264_template(pixel_sad_8x16_avx512)
#define x264_pixel_sad_8x16_cache32_mmx2 x264_template(pixel_sad_8x16_cache32_mmx2)
#define x264_pixel_sad_8x16_cache64_mmx2 x264_template(pixel_sad_8x16_cache64_mmx2)
#define x264_pixel_sad_8x16_mmx2 x264_template(pixel_sad_8x16_mmx2)
#define x264_pixel_sad_8x16_sse2 x264_template(pixel_sad_8x16_sse2)
#define x264_pixel_sad_8x16_sse2_aligned x264_template(pixel_sad_8x16_sse2_aligned)
#define x264_pixel_sad_8x16_ssse3 x264_template(pixel_sad_8x16_ssse3)
#define x264_pixel_sad_8x16_ssse3_aligned x264_template(pixel_sad_8x16_ssse3_aligned)
#define x264_pixel_sad_8x4_avx512 x264_template(pixel_sad_8x4_avx512)
#define x264_pixel_sad_8x4_cache32_mmx2 x264_template(pixel_sad_8x4_cache32_mmx2)
#define x264_pixel_sad_8x4_cache64_mmx2 x264_template(pixel_sad_8x4_cache64_mmx2)
#define x264_pixel_sad_8x4_mmx2 x264_template(pixel_sad_8x4_mmx2)
#define x264_pixel_sad_8x4_sse2 x264_template(pixel_sad_8x4_sse2)
#define x264_pixel_sad_8x4_ssse3 x264_template(pixel_sad_8x4_ssse3)
#define x264_pixel_sad_8x8_avx512 x264_template(pixel_sad_8x8_avx512)
#define x264_pixel_sad_8x8_cache32_mmx2 x264_template(pixel_sad_8x8_cache32_mmx2)
#define x264_pixel_sad_8x8_cache64_mmx2 x264_template(pixel_sad_8x8_cache64_mmx2)
#define x264_pixel_sad_8x8_mmx2 x264_template(pixel_sad_8x8_mmx2)
#define x264_pixel_sad_8x8_sse2 x264_template(pixel_sad_8x8_sse2)
#define x264_pixel_sad_8x8_sse2_aligned x264_template(pixel_sad_8x8_sse2_aligned)
#define x264_pixel_sad_8x8_ssse3 x264_template(pixel_sad_8x8_ssse3)
#define x264_pixel_sad_8x8_ssse3_aligned x264_template(pixel_sad_8x8_ssse3_aligned)
/* Symbol aliases for the pixel_sad_x3 kernels, routed through
 * x264_template() (defined outside this header).
 * Covers block sizes 16x16, 16x8, 4x4, 4x8, 8x16, 8x4 and 8x8; no 4x16
 * alias appears in this list. The set of ISA / cache variants differs per
 * block size. */
#define x264_pixel_sad_x3_16x16_avx x264_template(pixel_sad_x3_16x16_avx)
#define x264_pixel_sad_x3_16x16_avx2 x264_template(pixel_sad_x3_16x16_avx2)
#define x264_pixel_sad_x3_16x16_avx512 x264_template(pixel_sad_x3_16x16_avx512)
#define x264_pixel_sad_x3_16x16_cache32_mmx2 x264_template(pixel_sad_x3_16x16_cache32_mmx2)
#define x264_pixel_sad_x3_16x16_cache64_mmx2 x264_template(pixel_sad_x3_16x16_cache64_mmx2)
#define x264_pixel_sad_x3_16x16_cache64_sse2 x264_template(pixel_sad_x3_16x16_cache64_sse2)
#define x264_pixel_sad_x3_16x16_cache64_ssse3 x264_template(pixel_sad_x3_16x16_cache64_ssse3)
#define x264_pixel_sad_x3_16x16_mmx2 x264_template(pixel_sad_x3_16x16_mmx2)
#define x264_pixel_sad_x3_16x16_sse2 x264_template(pixel_sad_x3_16x16_sse2)
#define x264_pixel_sad_x3_16x16_sse3 x264_template(pixel_sad_x3_16x16_sse3)
#define x264_pixel_sad_x3_16x16_ssse3 x264_template(pixel_sad_x3_16x16_ssse3)
#define x264_pixel_sad_x3_16x16_xop x264_template(pixel_sad_x3_16x16_xop)
#define x264_pixel_sad_x3_16x8_avx x264_template(pixel_sad_x3_16x8_avx)
#define x264_pixel_sad_x3_16x8_avx2 x264_template(pixel_sad_x3_16x8_avx2)
#define x264_pixel_sad_x3_16x8_avx512 x264_template(pixel_sad_x3_16x8_avx512)
#define x264_pixel_sad_x3_16x8_cache32_mmx2 x264_template(pixel_sad_x3_16x8_cache32_mmx2)
#define x264_pixel_sad_x3_16x8_cache64_mmx2 x264_template(pixel_sad_x3_16x8_cache64_mmx2)
#define x264_pixel_sad_x3_16x8_cache64_sse2 x264_template(pixel_sad_x3_16x8_cache64_sse2)
#define x264_pixel_sad_x3_16x8_cache64_ssse3 x264_template(pixel_sad_x3_16x8_cache64_ssse3)
#define x264_pixel_sad_x3_16x8_mmx2 x264_template(pixel_sad_x3_16x8_mmx2)
#define x264_pixel_sad_x3_16x8_sse2 x264_template(pixel_sad_x3_16x8_sse2)
#define x264_pixel_sad_x3_16x8_sse3 x264_template(pixel_sad_x3_16x8_sse3)
#define x264_pixel_sad_x3_16x8_ssse3 x264_template(pixel_sad_x3_16x8_ssse3)
#define x264_pixel_sad_x3_16x8_xop x264_template(pixel_sad_x3_16x8_xop)
#define x264_pixel_sad_x3_4x4_avx512 x264_template(pixel_sad_x3_4x4_avx512)
#define x264_pixel_sad_x3_4x4_mmx2 x264_template(pixel_sad_x3_4x4_mmx2)
#define x264_pixel_sad_x3_4x4_ssse3 x264_template(pixel_sad_x3_4x4_ssse3)
#define x264_pixel_sad_x3_4x8_avx512 x264_template(pixel_sad_x3_4x8_avx512)
#define x264_pixel_sad_x3_4x8_mmx2 x264_template(pixel_sad_x3_4x8_mmx2)
#define x264_pixel_sad_x3_4x8_ssse3 x264_template(pixel_sad_x3_4x8_ssse3)
#define x264_pixel_sad_x3_8x16_avx512 x264_template(pixel_sad_x3_8x16_avx512)
#define x264_pixel_sad_x3_8x16_cache32_mmx2 x264_template(pixel_sad_x3_8x16_cache32_mmx2)
#define x264_pixel_sad_x3_8x16_cache64_mmx2 x264_template(pixel_sad_x3_8x16_cache64_mmx2)
#define x264_pixel_sad_x3_8x16_cache64_sse2 x264_template(pixel_sad_x3_8x16_cache64_sse2)
#define x264_pixel_sad_x3_8x16_mmx2 x264_template(pixel_sad_x3_8x16_mmx2)
#define x264_pixel_sad_x3_8x16_sse2 x264_template(pixel_sad_x3_8x16_sse2)
#define x264_pixel_sad_x3_8x16_ssse3 x264_template(pixel_sad_x3_8x16_ssse3)
#define x264_pixel_sad_x3_8x16_xop x264_template(pixel_sad_x3_8x16_xop)
#define x264_pixel_sad_x3_8x4_avx512 x264_template(pixel_sad_x3_8x4_avx512)
#define x264_pixel_sad_x3_8x4_mmx2 x264_template(pixel_sad_x3_8x4_mmx2)
#define x264_pixel_sad_x3_8x4_sse2 x264_template(pixel_sad_x3_8x4_sse2)
#define x264_pixel_sad_x3_8x4_ssse3 x264_template(pixel_sad_x3_8x4_ssse3)
#define x264_pixel_sad_x3_8x4_xop x264_template(pixel_sad_x3_8x4_xop)
#define x264_pixel_sad_x3_8x8_avx512 x264_template(pixel_sad_x3_8x8_avx512)
#define x264_pixel_sad_x3_8x8_cache32_mmx2 x264_template(pixel_sad_x3_8x8_cache32_mmx2)
#define x264_pixel_sad_x3_8x8_cache64_mmx2 x264_template(pixel_sad_x3_8x8_cache64_mmx2)
#define x264_pixel_sad_x3_8x8_mmx2 x264_template(pixel_sad_x3_8x8_mmx2)
#define x264_pixel_sad_x3_8x8_sse2 x264_template(pixel_sad_x3_8x8_sse2)
#define x264_pixel_sad_x3_8x8_ssse3 x264_template(pixel_sad_x3_8x8_ssse3)
#define x264_pixel_sad_x3_8x8_xop x264_template(pixel_sad_x3_8x8_xop)
/* Symbol aliases for the pixel_sad_x4 kernels, routed through
 * x264_template() (defined outside this header).
 * Mirrors the pixel_sad_x3 alias list entry for entry: block sizes 16x16,
 * 16x8, 4x4, 4x8, 8x16, 8x4 and 8x8, with no 4x16 alias. */
#define x264_pixel_sad_x4_16x16_avx x264_template(pixel_sad_x4_16x16_avx)
#define x264_pixel_sad_x4_16x16_avx2 x264_template(pixel_sad_x4_16x16_avx2)
#define x264_pixel_sad_x4_16x16_avx512 x264_template(pixel_sad_x4_16x16_avx512)
#define x264_pixel_sad_x4_16x16_cache32_mmx2 x264_template(pixel_sad_x4_16x16_cache32_mmx2)
#define x264_pixel_sad_x4_16x16_cache64_mmx2 x264_template(pixel_sad_x4_16x16_cache64_mmx2)
#define x264_pixel_sad_x4_16x16_cache64_sse2 x264_template(pixel_sad_x4_16x16_cache64_sse2)
#define x264_pixel_sad_x4_16x16_cache64_ssse3 x264_template(pixel_sad_x4_16x16_cache64_ssse3)
#define x264_pixel_sad_x4_16x16_mmx2 x264_template(pixel_sad_x4_16x16_mmx2)
#define x264_pixel_sad_x4_16x16_sse2 x264_template(pixel_sad_x4_16x16_sse2)
#define x264_pixel_sad_x4_16x16_sse3 x264_template(pixel_sad_x4_16x16_sse3)
#define x264_pixel_sad_x4_16x16_ssse3 x264_template(pixel_sad_x4_16x16_ssse3)
#define x264_pixel_sad_x4_16x16_xop x264_template(pixel_sad_x4_16x16_xop)
#define x264_pixel_sad_x4_16x8_avx x264_template(pixel_sad_x4_16x8_avx)
#define x264_pixel_sad_x4_16x8_avx2 x264_template(pixel_sad_x4_16x8_avx2)
#define x264_pixel_sad_x4_16x8_avx512 x264_template(pixel_sad_x4_16x8_avx512)
#define x264_pixel_sad_x4_16x8_cache32_mmx2 x264_template(pixel_sad_x4_16x8_cache32_mmx2)
#define x264_pixel_sad_x4_16x8_cache64_mmx2 x264_template(pixel_sad_x4_16x8_cache64_mmx2)
#define x264_pixel_sad_x4_16x8_cache64_sse2 x264_template(pixel_sad_x4_16x8_cache64_sse2)
#define x264_pixel_sad_x4_16x8_cache64_ssse3 x264_template(pixel_sad_x4_16x8_cache64_ssse3)
#define x264_pixel_sad_x4_16x8_mmx2 x264_template(pixel_sad_x4_16x8_mmx2)
#define x264_pixel_sad_x4_16x8_sse2 x264_template(pixel_sad_x4_16x8_sse2)
#define x264_pixel_sad_x4_16x8_sse3 x264_template(pixel_sad_x4_16x8_sse3)
#define x264_pixel_sad_x4_16x8_ssse3 x264_template(pixel_sad_x4_16x8_ssse3)
#define x264_pixel_sad_x4_16x8_xop x264_template(pixel_sad_x4_16x8_xop)
#define x264_pixel_sad_x4_4x4_avx512 x264_template(pixel_sad_x4_4x4_avx512)
#define x264_pixel_sad_x4_4x4_mmx2 x264_template(pixel_sad_x4_4x4_mmx2)
#define x264_pixel_sad_x4_4x4_ssse3 x264_template(pixel_sad_x4_4x4_ssse3)
#define x264_pixel_sad_x4_4x8_avx512 x264_template(pixel_sad_x4_4x8_avx512)
#define x264_pixel_sad_x4_4x8_mmx2 x264_template(pixel_sad_x4_4x8_mmx2)
#define x264_pixel_sad_x4_4x8_ssse3 x264_template(pixel_sad_x4_4x8_ssse3)
#define x264_pixel_sad_x4_8x16_avx512 x264_template(pixel_sad_x4_8x16_avx512)
#define x264_pixel_sad_x4_8x16_cache32_mmx2 x264_template(pixel_sad_x4_8x16_cache32_mmx2)
#define x264_pixel_sad_x4_8x16_cache64_mmx2 x264_template(pixel_sad_x4_8x16_cache64_mmx2)
#define x264_pixel_sad_x4_8x16_cache64_sse2 x264_template(pixel_sad_x4_8x16_cache64_sse2)
#define x264_pixel_sad_x4_8x16_mmx2 x264_template(pixel_sad_x4_8x16_mmx2)
#define x264_pixel_sad_x4_8x16_sse2 x264_template(pixel_sad_x4_8x16_sse2)
#define x264_pixel_sad_x4_8x16_ssse3 x264_template(pixel_sad_x4_8x16_ssse3)
#define x264_pixel_sad_x4_8x16_xop x264_template(pixel_sad_x4_8x16_xop)
#define x264_pixel_sad_x4_8x4_avx512 x264_template(pixel_sad_x4_8x4_avx512)
#define x264_pixel_sad_x4_8x4_mmx2 x264_template(pixel_sad_x4_8x4_mmx2)
#define x264_pixel_sad_x4_8x4_sse2 x264_template(pixel_sad_x4_8x4_sse2)
#define x264_pixel_sad_x4_8x4_ssse3 x264_template(pixel_sad_x4_8x4_ssse3)
#define x264_pixel_sad_x4_8x4_xop x264_template(pixel_sad_x4_8x4_xop)
#define x264_pixel_sad_x4_8x8_avx512 x264_template(pixel_sad_x4_8x8_avx512)
#define x264_pixel_sad_x4_8x8_cache32_mmx2 x264_template(pixel_sad_x4_8x8_cache32_mmx2)
#define x264_pixel_sad_x4_8x8_cache64_mmx2 x264_template(pixel_sad_x4_8x8_cache64_mmx2)
#define x264_pixel_sad_x4_8x8_mmx2 x264_template(pixel_sad_x4_8x8_mmx2)
#define x264_pixel_sad_x4_8x8_sse2 x264_template(pixel_sad_x4_8x8_sse2)
#define x264_pixel_sad_x4_8x8_ssse3 x264_template(pixel_sad_x4_8x8_ssse3)
#define x264_pixel_sad_x4_8x8_xop x264_template(pixel_sad_x4_8x8_xop)
/* Symbol aliases for the pixel_satd kernels, routed through x264_template()
 * (defined outside this header).
 * Covers block sizes 16x16, 16x8, 4x16, 4x4, 4x8, 8x16, 8x4 and 8x8.
 * The variant set is not uniform: for example 4x4 has no sse2 or ssse3_atom
 * alias, and 4x16 has no xop alias. */
#define x264_pixel_satd_16x16_avx x264_template(pixel_satd_16x16_avx)
#define x264_pixel_satd_16x16_avx2 x264_template(pixel_satd_16x16_avx2)
#define x264_pixel_satd_16x16_avx512 x264_template(pixel_satd_16x16_avx512)
#define x264_pixel_satd_16x16_mmx2 x264_template(pixel_satd_16x16_mmx2)
#define x264_pixel_satd_16x16_sse2 x264_template(pixel_satd_16x16_sse2)
#define x264_pixel_satd_16x16_sse4 x264_template(pixel_satd_16x16_sse4)
#define x264_pixel_satd_16x16_ssse3 x264_template(pixel_satd_16x16_ssse3)
#define x264_pixel_satd_16x16_ssse3_atom x264_template(pixel_satd_16x16_ssse3_atom)
#define x264_pixel_satd_16x16_xop x264_template(pixel_satd_16x16_xop)
#define x264_pixel_satd_16x8_avx x264_template(pixel_satd_16x8_avx)
#define x264_pixel_satd_16x8_avx2 x264_template(pixel_satd_16x8_avx2)
#define x264_pixel_satd_16x8_avx512 x264_template(pixel_satd_16x8_avx512)
#define x264_pixel_satd_16x8_mmx2 x264_template(pixel_satd_16x8_mmx2)
#define x264_pixel_satd_16x8_sse2 x264_template(pixel_satd_16x8_sse2)
#define x264_pixel_satd_16x8_sse4 x264_template(pixel_satd_16x8_sse4)
#define x264_pixel_satd_16x8_ssse3 x264_template(pixel_satd_16x8_ssse3)
#define x264_pixel_satd_16x8_ssse3_atom x264_template(pixel_satd_16x8_ssse3_atom)
#define x264_pixel_satd_16x8_xop x264_template(pixel_satd_16x8_xop)
#define x264_pixel_satd_4x16_avx x264_template(pixel_satd_4x16_avx)
#define x264_pixel_satd_4x16_avx512 x264_template(pixel_satd_4x16_avx512)
#define x264_pixel_satd_4x16_mmx2 x264_template(pixel_satd_4x16_mmx2)
#define x264_pixel_satd_4x16_sse2 x264_template(pixel_satd_4x16_sse2)
#define x264_pixel_satd_4x16_sse4 x264_template(pixel_satd_4x16_sse4)
#define x264_pixel_satd_4x16_ssse3 x264_template(pixel_satd_4x16_ssse3)
#define x264_pixel_satd_4x16_ssse3_atom x264_template(pixel_satd_4x16_ssse3_atom)
#define x264_pixel_satd_4x4_avx x264_template(pixel_satd_4x4_avx)
#define x264_pixel_satd_4x4_avx512 x264_template(pixel_satd_4x4_avx512)
#define x264_pixel_satd_4x4_mmx2 x264_template(pixel_satd_4x4_mmx2)
#define x264_pixel_satd_4x4_sse4 x264_template(pixel_satd_4x4_sse4)
#define x264_pixel_satd_4x4_ssse3 x264_template(pixel_satd_4x4_ssse3)
#define x264_pixel_satd_4x4_xop x264_template(pixel_satd_4x4_xop)
#define x264_pixel_satd_4x8_avx x264_template(pixel_satd_4x8_avx)
#define x264_pixel_satd_4x8_avx512 x264_template(pixel_satd_4x8_avx512)
#define x264_pixel_satd_4x8_mmx2 x264_template(pixel_satd_4x8_mmx2)
#define x264_pixel_satd_4x8_sse2 x264_template(pixel_satd_4x8_sse2)
#define x264_pixel_satd_4x8_sse4 x264_template(pixel_satd_4x8_sse4)
#define x264_pixel_satd_4x8_ssse3 x264_template(pixel_satd_4x8_ssse3)
#define x264_pixel_satd_4x8_ssse3_atom x264_template(pixel_satd_4x8_ssse3_atom)
#define x264_pixel_satd_4x8_xop x264_template(pixel_satd_4x8_xop)
#define x264_pixel_satd_8x16_avx x264_template(pixel_satd_8x16_avx)
#define x264_pixel_satd_8x16_avx2 x264_template(pixel_satd_8x16_avx2)
#define x264_pixel_satd_8x16_avx512 x264_template(pixel_satd_8x16_avx512)
#define x264_pixel_satd_8x16_mmx2 x264_template(pixel_satd_8x16_mmx2)
#define x264_pixel_satd_8x16_sse2 x264_template(pixel_satd_8x16_sse2)
#define x264_pixel_satd_8x16_sse4 x264_template(pixel_satd_8x16_sse4)
#define x264_pixel_satd_8x16_ssse3 x264_template(pixel_satd_8x16_ssse3)
#define x264_pixel_satd_8x16_ssse3_atom x264_template(pixel_satd_8x16_ssse3_atom)
#define x264_pixel_satd_8x16_xop x264_template(pixel_satd_8x16_xop)
#define x264_pixel_satd_8x4_avx x264_template(pixel_satd_8x4_avx)
#define x264_pixel_satd_8x4_avx512 x264_template(pixel_satd_8x4_avx512)
#define x264_pixel_satd_8x4_mmx2 x264_template(pixel_satd_8x4_mmx2)
#define x264_pixel_satd_8x4_sse2 x264_template(pixel_satd_8x4_sse2)
#define x264_pixel_satd_8x4_sse4 x264_template(pixel_satd_8x4_sse4)
#define x264_pixel_satd_8x4_ssse3 x264_template(pixel_satd_8x4_ssse3)
#define x264_pixel_satd_8x4_ssse3_atom x264_template(pixel_satd_8x4_ssse3_atom)
#define x264_pixel_satd_8x4_xop x264_template(pixel_satd_8x4_xop)
#define x264_pixel_satd_8x8_avx x264_template(pixel_satd_8x8_avx)
#define x264_pixel_satd_8x8_avx2 x264_template(pixel_satd_8x8_avx2)
#define x264_pixel_satd_8x8_avx512 x264_template(pixel_satd_8x8_avx512)
#define x264_pixel_satd_8x8_mmx2 x264_template(pixel_satd_8x8_mmx2)
#define x264_pixel_satd_8x8_sse2 x264_template(pixel_satd_8x8_sse2)
#define x264_pixel_satd_8x8_sse4 x264_template(pixel_satd_8x8_sse4)
#define x264_pixel_satd_8x8_ssse3 x264_template(pixel_satd_8x8_ssse3)
#define x264_pixel_satd_8x8_ssse3_atom x264_template(pixel_satd_8x8_ssse3_atom)
#define x264_pixel_satd_8x8_xop x264_template(pixel_satd_8x8_xop)
/* Symbol aliases for the pixel_ssd kernels, routed through x264_template()
 * (defined outside this header).
 * Covers block sizes 16x16, 16x8, 4x16, 4x4, 4x8, 8x16, 8x4 and 8x8.
 * The 4xN sizes are aliased only for mmx, mmx2 and ssse3; avx2 is aliased
 * only for 16x16 and 16x8. */
#define x264_pixel_ssd_16x16_avx x264_template(pixel_ssd_16x16_avx)
#define x264_pixel_ssd_16x16_avx2 x264_template(pixel_ssd_16x16_avx2)
#define x264_pixel_ssd_16x16_mmx x264_template(pixel_ssd_16x16_mmx)
#define x264_pixel_ssd_16x16_mmx2 x264_template(pixel_ssd_16x16_mmx2)
#define x264_pixel_ssd_16x16_sse2 x264_template(pixel_ssd_16x16_sse2)
#define x264_pixel_ssd_16x16_sse2slow x264_template(pixel_ssd_16x16_sse2slow)
#define x264_pixel_ssd_16x16_ssse3 x264_template(pixel_ssd_16x16_ssse3)
#define x264_pixel_ssd_16x16_xop x264_template(pixel_ssd_16x16_xop)
#define x264_pixel_ssd_16x8_avx x264_template(pixel_ssd_16x8_avx)
#define x264_pixel_ssd_16x8_avx2 x264_template(pixel_ssd_16x8_avx2)
#define x264_pixel_ssd_16x8_mmx x264_template(pixel_ssd_16x8_mmx)
#define x264_pixel_ssd_16x8_mmx2 x264_template(pixel_ssd_16x8_mmx2)
#define x264_pixel_ssd_16x8_sse2 x264_template(pixel_ssd_16x8_sse2)
#define x264_pixel_ssd_16x8_sse2slow x264_template(pixel_ssd_16x8_sse2slow)
#define x264_pixel_ssd_16x8_ssse3 x264_template(pixel_ssd_16x8_ssse3)
#define x264_pixel_ssd_16x8_xop x264_template(pixel_ssd_16x8_xop)
#define x264_pixel_ssd_4x16_mmx x264_template(pixel_ssd_4x16_mmx)
#define x264_pixel_ssd_4x16_mmx2 x264_template(pixel_ssd_4x16_mmx2)
#define x264_pixel_ssd_4x16_ssse3 x264_template(pixel_ssd_4x16_ssse3)
#define x264_pixel_ssd_4x4_mmx x264_template(pixel_ssd_4x4_mmx)
#define x264_pixel_ssd_4x4_mmx2 x264_template(pixel_ssd_4x4_mmx2)
#define x264_pixel_ssd_4x4_ssse3 x264_template(pixel_ssd_4x4_ssse3)
#define x264_pixel_ssd_4x8_mmx x264_template(pixel_ssd_4x8_mmx)
#define x264_pixel_ssd_4x8_mmx2 x264_template(pixel_ssd_4x8_mmx2)
#define x264_pixel_ssd_4x8_ssse3 x264_template(pixel_ssd_4x8_ssse3)
#define x264_pixel_ssd_8x16_avx x264_template(pixel_ssd_8x16_avx)
#define x264_pixel_ssd_8x16_mmx x264_template(pixel_ssd_8x16_mmx)
#define x264_pixel_ssd_8x16_mmx2 x264_template(pixel_ssd_8x16_mmx2)
#define x264_pixel_ssd_8x16_sse2 x264_template(pixel_ssd_8x16_sse2)
#define x264_pixel_ssd_8x16_sse2slow x264_template(pixel_ssd_8x16_sse2slow)
#define x264_pixel_ssd_8x16_ssse3 x264_template(pixel_ssd_8x16_ssse3)
#define x264_pixel_ssd_8x16_xop x264_template(pixel_ssd_8x16_xop)
#define x264_pixel_ssd_8x4_avx x264_template(pixel_ssd_8x4_avx)
#define x264_pixel_ssd_8x4_mmx x264_template(pixel_ssd_8x4_mmx)
#define x264_pixel_ssd_8x4_mmx2 x264_template(pixel_ssd_8x4_mmx2)
#define x264_pixel_ssd_8x4_sse2 x264_template(pixel_ssd_8x4_sse2)
#define x264_pixel_ssd_8x4_sse2slow x264_template(pixel_ssd_8x4_sse2slow)
#define x264_pixel_ssd_8x4_ssse3 x264_template(pixel_ssd_8x4_ssse3)
#define x264_pixel_ssd_8x4_xop x264_template(pixel_ssd_8x4_xop)
#define x264_pixel_ssd_8x8_avx x264_template(pixel_ssd_8x8_avx)
#define x264_pixel_ssd_8x8_mmx x264_template(pixel_ssd_8x8_mmx)
#define x264_pixel_ssd_8x8_mmx2 x264_template(pixel_ssd_8x8_mmx2)
#define x264_pixel_ssd_8x8_sse2 x264_template(pixel_ssd_8x8_sse2)
#define x264_pixel_ssd_8x8_sse2slow x264_template(pixel_ssd_8x8_sse2slow)
#define x264_pixel_ssd_8x8_ssse3 x264_template(pixel_ssd_8x8_ssse3)
#define x264_pixel_ssd_8x8_xop x264_template(pixel_ssd_8x8_xop)
/* Symbol aliases for the pixel_var kernels, routed through x264_template()
 * (defined outside this header).
 * Only 16x16, 8x16 and 8x8 are aliased; the avx2 variant is aliased for
 * 16x16 only. */
#define x264_pixel_var_16x16_avx x264_template(pixel_var_16x16_avx)
#define x264_pixel_var_16x16_avx2 x264_template(pixel_var_16x16_avx2)
#define x264_pixel_var_16x16_avx512 x264_template(pixel_var_16x16_avx512)
#define x264_pixel_var_16x16_sse2 x264_template(pixel_var_16x16_sse2)
#define x264_pixel_var_8x16_avx x264_template(pixel_var_8x16_avx)
#define x264_pixel_var_8x16_avx512 x264_template(pixel_var_8x16_avx512)
#define x264_pixel_var_8x16_sse2 x264_template(pixel_var_8x16_sse2)
#define x264_pixel_var_8x8_avx x264_template(pixel_var_8x8_avx)
#define x264_pixel_var_8x8_avx512 x264_template(pixel_var_8x8_avx512)
#define x264_pixel_var_8x8_sse2 x264_template(pixel_var_8x8_sse2)
/* DECL_PIXELS( ret, name, suffix, args )
 *
 * Expands to eight function prototypes, one per block size
 * (16x16, 16x8, 8x16, 8x8, 8x4, 4x16, 4x8, 4x4), each of the form
 *
 *     ret x264_pixel_<name>_<W>x<H>_<suffix> args;
 *
 *   ret    - return type of every generated prototype
 *   name   - kernel family pasted into the symbol (e.g. sad, var)
 *   suffix - implementation suffix pasted into the symbol (e.g. sse2)
 *   args   - complete parenthesized parameter list
 *
 * Every generated prototype carries its own terminating ';', so an
 * invocation must not be followed by another ';'.
 *
 * The final body line deliberately has NO line continuation: a trailing
 * backslash there would splice whatever line follows (here, the next
 * #define) into this macro's replacement list. */
#define DECL_PIXELS( ret, name, suffix, args ) \
    ret x264_pixel_##name##_16x16_##suffix args;\
    ret x264_pixel_##name##_16x8_##suffix args;\
    ret x264_pixel_##name##_8x16_##suffix args;\
    ret x264_pixel_##name##_8x8_##suffix args;\
    ret x264_pixel_##name##_8x4_##suffix args;\
    ret x264_pixel_##name##_4x16_##suffix args;\
    ret x264_pixel_##name##_4x8_##suffix args;\
    ret x264_pixel_##name##_4x4_##suffix args;

/* DECL_X1( name, suffix )
 * Declares the eight block sizes of a kernel that returns int and takes
 * ( pixel *, intptr_t, pixel *, intptr_t ). */
#define DECL_X1( name, suffix ) \
    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )

/* DECL_X4( name, suffix )
 * Declares both the <name>_x3 and <name>_x4 families for all eight block
 * sizes. Both return void; the _x3 form takes four pixel pointers, the _x4
 * form five, each followed by an intptr_t and an int *. */
#define DECL_X4( name, suffix ) \
    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
/* Prototypes generated via DECL_X1 / DECL_X4. Each invocation emits all
 * eight block sizes for the given suffix, including sizes for which this
 * header defines no x264_template alias. */

/* sad: int-returning single-comparison prototypes. */
DECL_X1( sad, mmx2 )
DECL_X1( sad, sse2 )
DECL_X1( sad, sse3 )
DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
DECL_X1( sad, avx2 )
DECL_X1( sad, avx512 )
/* sad_x3 / sad_x4: void prototypes taking an int * as last parameter. */
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
DECL_X4( sad, ssse3 )
DECL_X4( sad, xop )
DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
DECL_X4( sad, avx512 )
/* ssd */
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
DECL_X1( ssd, sse2 )
DECL_X1( ssd, ssse3 )
DECL_X1( ssd, avx )
DECL_X1( ssd, xop )
DECL_X1( ssd, avx2 )
/* satd */
DECL_X1( satd, mmx2 )
DECL_X1( satd, sse2 )
DECL_X1( satd, ssse3 )
DECL_X1( satd, ssse3_atom )
DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( satd, avx2 )
DECL_X1( satd, avx512 )
/* sa8d: all eight sizes are declared here, although only the 16x16 and 8x8
 * names have x264_template aliases in this header. */
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
DECL_X1( sa8d, ssse3_atom )
DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sa8d, avx2 )
DECL_X1( sa8d, avx512 )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
DECL_X1( sad, cache64_ssse3 );
DECL_X4( sad, cache32_mmx2 );
DECL_X4( sad, cache64_mmx2 );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
/* var: uint64_t-returning prototypes taking a pixel pointer and its stride.
 * DECL_PIXELS emits all eight block sizes per suffix, although only 16x16,
 * 8x16 and 8x8 have x264_template aliases in this header. */
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride ))
/* hadamard_ac: same ( pixel *pix, intptr_t i_stride ) -> uint64_t shape.
 * How the 64-bit result is laid out is not visible from this header. */
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3_atom, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, avx2, ( pixel *pix, intptr_t i_stride ))
/* intra_*_x3 prototypes. Each is preceded by its x264_template alias
 * (x264_template is defined outside this header).
 * All return void and take two sample-buffer pointers plus an int *
 * (presumably the int * receives one cost per candidate mode -- TODO
 * confirm against the asm). The pointer element type varies per
 * declaration: pixel, uint8_t or uint16_t. */

/* 4x4 */
#define x264_intra_satd_x3_4x4_mmx2 x264_template(intra_satd_x3_4x4_mmx2)
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
#define x264_intra_sad_x3_4x4_mmx2 x264_template(intra_sad_x3_4x4_mmx2)
void x264_intra_sad_x3_4x4_mmx2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_4x4_sse2 x264_template(intra_sad_x3_4x4_sse2)
void x264_intra_sad_x3_4x4_sse2 ( uint16_t*, uint16_t*, int * );
#define x264_intra_sad_x3_4x4_ssse3 x264_template(intra_sad_x3_4x4_ssse3)
void x264_intra_sad_x3_4x4_ssse3 ( uint16_t*, uint16_t*, int * );
#define x264_intra_sad_x3_4x4_avx x264_template(intra_sad_x3_4x4_avx)
void x264_intra_sad_x3_4x4_avx ( uint16_t*, uint16_t*, int * );
/* 8x8c */
#define x264_intra_satd_x3_8x8c_mmx2 x264_template(intra_satd_x3_8x8c_mmx2)
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
#define x264_intra_satd_x3_8x8c_ssse3 x264_template(intra_satd_x3_8x8c_ssse3)
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_8x8c_mmx2 x264_template(intra_sad_x3_8x8c_mmx2)
void x264_intra_sad_x3_8x8c_mmx2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_8x8c_ssse3 x264_template(intra_sad_x3_8x8c_ssse3)
void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_8x8c_avx2 x264_template(intra_sad_x3_8x8c_avx2)
void x264_intra_sad_x3_8x8c_avx2 ( uint8_t *, uint8_t *, int * );
/* 16x16 */
#define x264_intra_satd_x3_16x16_mmx2 x264_template(intra_satd_x3_16x16_mmx2)
void x264_intra_satd_x3_16x16_mmx2 ( pixel *, pixel *, int * );
#define x264_intra_satd_x3_16x16_ssse3 x264_template(intra_satd_x3_16x16_ssse3)
void x264_intra_satd_x3_16x16_ssse3( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_16x16_mmx2 x264_template(intra_sad_x3_16x16_mmx2)
void x264_intra_sad_x3_16x16_mmx2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_16x16_sse2 x264_template(intra_sad_x3_16x16_sse2)
void x264_intra_sad_x3_16x16_sse2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_16x16_ssse3 x264_template(intra_sad_x3_16x16_ssse3)
void x264_intra_sad_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_16x16_avx2 x264_template(intra_sad_x3_16x16_avx2)
void x264_intra_sad_x3_16x16_avx2 ( uint8_t *, uint8_t *, int * );
/* 8x8 */
#define x264_intra_sa8d_x3_8x8_mmx2 x264_template(intra_sa8d_x3_8x8_mmx2)
void x264_intra_sa8d_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sa8d_x3_8x8_sse2 x264_template(intra_sa8d_x3_8x8_sse2)
void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_8x8_mmx2 x264_template(intra_sad_x3_8x8_mmx2)
void x264_intra_sad_x3_8x8_mmx2 ( uint8_t *, uint8_t *, int * );
#define x264_intra_sad_x3_8x8_sse2 x264_template(intra_sad_x3_8x8_sse2)
void x264_intra_sad_x3_8x8_sse2 ( uint16_t*, uint16_t*, int * );
#define x264_intra_sad_x3_8x8_ssse3 x264_template(intra_sad_x3_8x8_ssse3)
void x264_intra_sad_x3_8x8_ssse3 ( uint16_t*, uint16_t*, int * );
#define x264_intra_sad_x3_8x8_avx2 x264_template(intra_sad_x3_8x8_avx2)
void x264_intra_sad_x3_8x8_avx2 ( uint16_t*, uint16_t*, int * );
/* intra_*_x9 prototypes. Each is preceded by its x264_template alias
 * (x264_template is defined outside this header). Unlike the x3 family
 * these return int; the meaning of the return value and of the uint16_t *
 * parameters is not visible from this header (presumably a best cost/mode
 * plus per-mode scratch -- TODO confirm against the asm). */

/* 4x4: two uint8_t buffers and one uint16_t buffer. */
#define x264_intra_satd_x9_4x4_ssse3 x264_template(intra_satd_x9_4x4_ssse3)
int x264_intra_satd_x9_4x4_ssse3( uint8_t *, uint8_t *, uint16_t * );
#define x264_intra_satd_x9_4x4_sse4 x264_template(intra_satd_x9_4x4_sse4)
int x264_intra_satd_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
#define x264_intra_satd_x9_4x4_avx x264_template(intra_satd_x9_4x4_avx)
int x264_intra_satd_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
#define x264_intra_satd_x9_4x4_xop x264_template(intra_satd_x9_4x4_xop)
int x264_intra_satd_x9_4x4_xop ( uint8_t *, uint8_t *, uint16_t * );
#define x264_intra_sad_x9_4x4_ssse3 x264_template(intra_sad_x9_4x4_ssse3)
int x264_intra_sad_x9_4x4_ssse3 ( uint8_t *, uint8_t *, uint16_t * );
#define x264_intra_sad_x9_4x4_sse4 x264_template(intra_sad_x9_4x4_sse4)
int x264_intra_sad_x9_4x4_sse4 ( uint8_t *, uint8_t *, uint16_t * );
#define x264_intra_sad_x9_4x4_avx x264_template(intra_sad_x9_4x4_avx)
int x264_intra_sad_x9_4x4_avx ( uint8_t *, uint8_t *, uint16_t * );
/* 8x8: three uint8_t buffers and two uint16_t buffers. */
#define x264_intra_sa8d_x9_8x8_ssse3 x264_template(intra_sa8d_x9_8x8_ssse3)
int x264_intra_sa8d_x9_8x8_ssse3( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
#define x264_intra_sa8d_x9_8x8_sse4 x264_template(intra_sa8d_x9_8x8_sse4)
int x264_intra_sa8d_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
#define x264_intra_sa8d_x9_8x8_avx x264_template(intra_sa8d_x9_8x8_avx)
int x264_intra_sa8d_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
#define x264_intra_sad_x9_8x8_ssse3 x264_template(intra_sad_x9_8x8_ssse3)
int x264_intra_sad_x9_8x8_ssse3 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
#define x264_intra_sad_x9_8x8_sse4 x264_template(intra_sad_x9_8x8_sse4)
int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
#define x264_intra_sad_x9_8x8_avx x264_template(intra_sad_x9_8x8_avx)
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
#define x264_intra_sad_x9_8x8_avx2 x264_template(intra_sad_x9_8x8_avx2)
int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
/* ssd_nv12_core prototypes: two pixel buffers with their strides, a width and
 * a height; nothing is returned, the two uint64_t pointers are non-const.
 * NOTE(review): presumably the U and V squared-error sums are written through
 * ssd_u / ssd_v -- confirm against the implementation. */
#define x264_pixel_ssd_nv12_core_sse2 x264_template(pixel_ssd_nv12_core_sse2)
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
#define x264_pixel_ssd_nv12_core_avx x264_template(pixel_ssd_nv12_core_avx)
void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
#define x264_pixel_ssd_nv12_core_xop x264_template(pixel_ssd_nv12_core_xop)
void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
#define x264_pixel_ssd_nv12_core_avx2 x264_template(pixel_ssd_nv12_core_avx2)
void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
/* SSIM helpers.  The 4x4x2 cores take two read-only pixel buffers and an
 * int sums[2][4] array; note that the mmx2 variant is declared on uint8_t
 * while the sse2/avx variants are declared on pixel. */
#define x264_pixel_ssim_4x4x2_core_mmx2 x264_template(pixel_ssim_4x4x2_core_mmx2)
void x264_pixel_ssim_4x4x2_core_mmx2( const uint8_t *pix1, intptr_t stride1,
const uint8_t *pix2, intptr_t stride2, int sums[2][4] );
#define x264_pixel_ssim_4x4x2_core_sse2 x264_template(pixel_ssim_4x4x2_core_sse2)
void x264_pixel_ssim_4x4x2_core_sse2( const pixel *pix1, intptr_t stride1,
const pixel *pix2, intptr_t stride2, int sums[2][4] );
#define x264_pixel_ssim_4x4x2_core_avx x264_template(pixel_ssim_4x4x2_core_avx)
void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
const pixel *pix2, intptr_t stride2, int sums[2][4] );
/* ssim_end4 takes two int[5][4] sum arrays plus a width and returns a float. */
#define x264_pixel_ssim_end4_sse2 x264_template(pixel_ssim_end4_sse2)
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
#define x264_pixel_ssim_end4_avx x264_template(pixel_ssim_end4_avx)
float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
/* var2 prototypes (8x8 and 8x16): an fenc and an fdec buffer plus an
 * int ssd[2] array, int result.  The ssse3 variants are declared on uint8_t,
 * all others on pixel.
 * NOTE(review): no stride argument is passed, so the buffers presumably use
 * the fixed FENC/FDEC strides -- confirm against the implementation. */
#define x264_pixel_var2_8x8_sse2 x264_template(pixel_var2_8x8_sse2)
int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
#define x264_pixel_var2_8x8_ssse3 x264_template(pixel_var2_8x8_ssse3)
int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
#define x264_pixel_var2_8x8_avx2 x264_template(pixel_var2_8x8_avx2)
int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
#define x264_pixel_var2_8x8_avx512 x264_template(pixel_var2_8x8_avx512)
int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
#define x264_pixel_var2_8x16_sse2 x264_template(pixel_var2_8x16_sse2)
int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
#define x264_pixel_var2_8x16_ssse3 x264_template(pixel_var2_8x16_ssse3)
int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
#define x264_pixel_var2_8x16_avx2 x264_template(pixel_var2_8x16_avx2)
int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
#define x264_pixel_var2_8x16_avx512 x264_template(pixel_var2_8x16_avx512)
int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
/* vsad prototypes: one buffer, its stride and a height, int result.  The avx2
 * variant is declared on uint16_t rather than pixel. */
#define x264_pixel_vsad_mmx2 x264_template(pixel_vsad_mmx2)
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
#define x264_pixel_vsad_sse2 x264_template(pixel_vsad_sse2)
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
#define x264_pixel_vsad_ssse3 x264_template(pixel_vsad_ssse3)
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
#define x264_pixel_vsad_xop x264_template(pixel_vsad_xop)
int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height );
#define x264_pixel_vsad_avx2 x264_template(pixel_vsad_avx2)
int x264_pixel_vsad_avx2 ( uint16_t *src, intptr_t stride, int height );
/* asd8 prototypes: two buffers with their strides and a height, int result. */
#define x264_pixel_asd8_sse2 x264_template(pixel_asd8_sse2)
int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
#define x264_pixel_asd8_ssse3 x264_template(pixel_asd8_ssse3)
int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
#define x264_pixel_asd8_xop x264_template(pixel_asd8_xop)
int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
/* Combined sa8d/satd 16x16 prototypes: two buffers with their strides,
 * uint64_t result.
 * NOTE(review): presumably both metrics are packed into the single 64-bit
 * return value -- confirm the packing order against the caller. */
#define x264_pixel_sa8d_satd_16x16_sse2 x264_template(pixel_sa8d_satd_16x16_sse2)
uint64_t x264_pixel_sa8d_satd_16x16_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define x264_pixel_sa8d_satd_16x16_ssse3 x264_template(pixel_sa8d_satd_16x16_ssse3)
uint64_t x264_pixel_sa8d_satd_16x16_ssse3 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define x264_pixel_sa8d_satd_16x16_ssse3_atom x264_template(pixel_sa8d_satd_16x16_ssse3_atom)
uint64_t x264_pixel_sa8d_satd_16x16_ssse3_atom( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define x264_pixel_sa8d_satd_16x16_sse4 x264_template(pixel_sa8d_satd_16x16_sse4)
uint64_t x264_pixel_sa8d_satd_16x16_sse4 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define x264_pixel_sa8d_satd_16x16_avx x264_template(pixel_sa8d_satd_16x16_avx)
uint64_t x264_pixel_sa8d_satd_16x16_avx ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define x264_pixel_sa8d_satd_16x16_xop x264_template(pixel_sa8d_satd_16x16_xop)
uint64_t x264_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
#define x264_pixel_sa8d_satd_16x16_avx2 x264_template(pixel_sa8d_satd_16x16_avx2)
uint64_t x264_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
/* Declares x264_pixel_ads<size>_<suffix>().  `size` is pasted into the
 * function name and also used as the length of the enc_dc[] parameter.  The
 * macro body already ends in ';', so the instantiations below need no
 * trailing semicolon. */
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
uint16_t *cost_mvx, int16_t *mvs, int width, int thresh );
/* One declaration per size (4, 2, 1) for each instruction-set suffix. */
DECL_ADS( 4, mmx2 )
DECL_ADS( 2, mmx2 )
DECL_ADS( 1, mmx2 )
DECL_ADS( 4, sse2 )
DECL_ADS( 2, sse2 )
DECL_ADS( 1, sse2 )
DECL_ADS( 4, ssse3 )
DECL_ADS( 2, ssse3 )
DECL_ADS( 1, ssse3 )
DECL_ADS( 4, avx )
DECL_ADS( 2, avx )
DECL_ADS( 1, avx )
DECL_ADS( 4, avx2 )
DECL_ADS( 2, avx2 )
DECL_ADS( 1, avx2 )
/* Remove the declaration helper macros so they do not leak into files that
 * include this header. */
#undef DECL_PIXELS
#undef DECL_X1
#undef DECL_X4
#undef DECL_ADS
#endif

2181
common/x86/predict-a.asm Normal file

File diff suppressed because it is too large Load Diff

620
common/x86/predict-c.c Normal file
View File

@@ -0,0 +1,620 @@
/*****************************************************************************
* predict-c.c: intra prediction
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
#include "predict.h"
#include "pixel.h"
/* Accumulates one term of the plane-prediction gradients.
 * H gains i times the difference between the pixels at columns j+i and j-i of
 * the row one FDEC_STRIDE above src; V gains i times the difference between
 * the pixels at rows j+i and j-i of the column immediately left of src.
 * Expects `src`, `H` and `V` to be in scope at the expansion site.  The
 * arguments are substituted without parentheses, so pass plain literals. */
#define PREDICT_P_SUM(j,i)\
H += i * ( src[j+i - FDEC_STRIDE ] - src[j-i - FDEC_STRIDE ] );\
V += i * ( src[(j+i)*FDEC_STRIDE -1] - src[(j-i)*FDEC_STRIDE -1] );
/* Weight vectors passed as memory operands to the inline-asm gradient sums in
 * this file, so they only exist when inline asm is available.  The
 * HIGH_BIT_DEPTH build uses 16-bit lanes (multiplied with pmaddwd), the 8-bit
 * build uses signed bytes (multiplied with pmaddubsw). */
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
ALIGNED_16( static const int16_t pw_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_16( static const int16_t pw_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_16( static const int16_t pw_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#else // !HIGH_BIT_DEPTH
ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
#endif // HIGH_BIT_DEPTH
#endif // HAVE_X86_INLINE_ASM
/* C gradient sums for 16x16 plane prediction: declares H and V and adds the
 * eight PREDICT_P_SUM terms (weights 1..8) centred on column/row 7. */
#define PREDICT_16x16_P_CORE\
int H = 0;\
int V = 0;\
PREDICT_P_SUM(7,1)\
PREDICT_P_SUM(7,2)\
PREDICT_P_SUM(7,3)\
PREDICT_P_SUM(7,4)\
PREDICT_P_SUM(7,5)\
PREDICT_P_SUM(7,6)\
PREDICT_P_SUM(7,7)\
PREDICT_P_SUM(7,8)
/* Converts the H/V sums already in scope into a, b, c and the start value i00,
 * then calls x264_predict_16x16_p_core_<name>( src, i00, b, c ).
 * When BIT_DEPTH > 8 and i00 exceeds 0x7fff, or |b| or |c| exceeds 1092, it
 * calls x264_predict_16x16_p_c( src ) instead of the core kernel. */
#define PREDICT_16x16_P_END(name)\
int a = 16 * ( src[15*FDEC_STRIDE -1] + src[15 - FDEC_STRIDE] );\
int b = ( 5 * H + 32 ) >> 6;\
int c = ( 5 * V + 32 ) >> 6;\
int i00 = a - b * 7 - c * 7 + 16;\
/* b*15 + c*15 can overflow: it's easier to just branch away in this rare case\
* than to try to consider it in the asm. */\
if( BIT_DEPTH > 8 && (i00 > 0x7fff || abs(b) > 1092 || abs(c) > 1092) )\
x264_predict_16x16_p_c( src );\
else\
x264_predict_16x16_p_core_##name( src, i00, b, c );
/* Defines static predict_16x16_p_<name>( pixel *src ): the C gradient sums
 * followed by the <name2> core kernel.  <name> and <name2> may differ, which
 * lets one core kernel back several wrappers. */
#define PREDICT_16x16_P(name, name2)\
static void predict_16x16_p_##name( pixel *src )\
{\
PREDICT_16x16_P_CORE\
PREDICT_16x16_P_END(name2)\
}
/* Inline-asm variant of the 16x16 plane wrapper: H is computed with SIMD, V
 * stays in C.  Without HAVE_X86_INLINE_ASM, PREDICT_16x16_P_INLINE is simply
 * an alias for PREDICT_16x16_P (see the #else at the end of this section). */
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
/* H for 16-bit pixels.  Operand %1 (8 pixels starting at column -1 of the row
 * above, loaded unaligned) is multiplied by pw_m87654321; operand %2 (8 pixels
 * starting at column 8, loaded with the aligned movdqa) is multiplied by
 * pw_12345678.  pmaddwd yields 32-bit pair sums, which are then added
 * horizontally and moved into H. */
#define PREDICT_16x16_P_ASM\
asm (\
"movdqu %1, %%xmm1 \n"\
"movdqa %2, %%xmm0 \n"\
"pmaddwd %3, %%xmm0 \n"\
"pmaddwd %4, %%xmm1 \n"\
"paddd %%xmm1, %%xmm0 \n"\
"movhlps %%xmm0, %%xmm1 \n"\
"paddd %%xmm1, %%xmm0 \n"\
"pshuflw $14, %%xmm0, %%xmm1 \n"\
"paddd %%xmm1, %%xmm0 \n"\
"movd %%xmm0, %0 \n"\
:"=r"(H)\
:"m"(MEM_FIX(&src[-FDEC_STRIDE-1], const pixel, 8)),\
"m"(MEM_FIX(&src[-FDEC_STRIDE+8], const pixel, 8)),\
"m"(MEM_FIX(pw_12345678, const int16_t, 8)),\
"m"(MEM_FIX(pw_m87654321, const int16_t, 8))\
:"xmm0", "xmm1"\
);
#else // !HIGH_BIT_DEPTH
/* H for 8-bit pixels using MMX registers.  palignr $7 combines the 8 bytes at
 * column 0 (%1) with the 8 bytes at column -8 (%3) so that mm1 holds columns
 * -1..6; mm0 holds columns 8..15 (%2).  pmaddubsw multiplies the unsigned
 * pixels by the signed byte weights, the 16-bit partial sums are added
 * horizontally with pshufw/paddw, and movswl sign-extends the low 16 bits of
 * the result into H.  palignr and pmaddubsw are SSSE3 instructions. */
#define PREDICT_16x16_P_ASM\
asm (\
"movq %1, %%mm1 \n"\
"movq %2, %%mm0 \n"\
"palignr $7, %3, %%mm1 \n"\
"pmaddubsw %4, %%mm0 \n"\
"pmaddubsw %5, %%mm1 \n"\
"paddw %%mm1, %%mm0 \n"\
"pshufw $14, %%mm0, %%mm1 \n"\
"paddw %%mm1, %%mm0 \n"\
"pshufw $1, %%mm0, %%mm1 \n"\
"paddw %%mm1, %%mm0 \n"\
"movd %%mm0, %0 \n"\
"movswl %w0, %0 \n"\
:"=r"(H)\
:"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\
"m"(MEM_FIX(&src[-FDEC_STRIDE+8], const pixel, 8)),\
"m"(MEM_FIX(&src[-FDEC_STRIDE-8], const pixel, 8)),\
"m"(MEM_FIX(pb_12345678, const int8_t, 8)),\
"m"(MEM_FIX(pb_m87654321, const int8_t, 8))\
:"mm0", "mm1"\
);
#endif // HIGH_BIT_DEPTH
/* Declares H and V, fills H via PREDICT_16x16_P_ASM and computes V in C as
 * the same weighted sum over the left column, written out term by term
 * (weight k pairs row 7+k with row 7-k, k = 8..1). */
#define PREDICT_16x16_P_CORE_INLINE\
int H, V;\
PREDICT_16x16_P_ASM\
V = 8 * ( src[15*FDEC_STRIDE-1] - src[-1*FDEC_STRIDE-1] )\
+ 7 * ( src[14*FDEC_STRIDE-1] - src[ 0*FDEC_STRIDE-1] )\
+ 6 * ( src[13*FDEC_STRIDE-1] - src[ 1*FDEC_STRIDE-1] )\
+ 5 * ( src[12*FDEC_STRIDE-1] - src[ 2*FDEC_STRIDE-1] )\
+ 4 * ( src[11*FDEC_STRIDE-1] - src[ 3*FDEC_STRIDE-1] )\
+ 3 * ( src[10*FDEC_STRIDE-1] - src[ 4*FDEC_STRIDE-1] )\
+ 2 * ( src[ 9*FDEC_STRIDE-1] - src[ 5*FDEC_STRIDE-1] )\
+ 1 * ( src[ 8*FDEC_STRIDE-1] - src[ 6*FDEC_STRIDE-1] );
/* Defines static predict_16x16_p_<name>() using the inline-asm sums and the
 * <name2> core kernel. */
#define PREDICT_16x16_P_INLINE(name, name2)\
static void predict_16x16_p_##name( pixel *src )\
{\
PREDICT_16x16_P_CORE_INLINE\
PREDICT_16x16_P_END(name2)\
}
#else // !HAVE_X86_INLINE_ASM
/* No inline asm: fall back to the pure-C wrapper. */
#define PREDICT_16x16_P_INLINE(name, name2) PREDICT_16x16_P(name, name2)
#endif // HAVE_X86_INLINE_ASM
/* Instantiate the static predict_16x16_p_* wrappers.
 * HIGH_BIT_DEPTH: sse2 only.
 * 8-bit: mmx2 (32-bit builds only), sse2 with C sums, ssse3 with inline-asm
 * sums on top of the sse2 core (only when inline asm is available), and avx.
 * avx2 is generated for both bit depths. */
#if HIGH_BIT_DEPTH
PREDICT_16x16_P_INLINE( sse2, sse2 )
#else // !HIGH_BIT_DEPTH
#if !ARCH_X86_64
PREDICT_16x16_P( mmx2, mmx2 )
#endif // !ARCH_X86_64
PREDICT_16x16_P( sse2, sse2 )
#if HAVE_X86_INLINE_ASM
PREDICT_16x16_P_INLINE( ssse3, sse2 )
#endif // HAVE_X86_INLINE_ASM
PREDICT_16x16_P_INLINE( avx, avx )
#endif // HIGH_BIT_DEPTH
PREDICT_16x16_P_INLINE( avx2, avx2 )
/* C gradient sums for 8x16 chroma plane prediction.  H uses weights 1..4 on
 * the row above src, pairing column 4+i with column 2-i; V uses weights 1..8
 * on the column left of src, pairing row i+8 with row 6-i. */
#define PREDICT_8x16C_P_CORE\
int H = 0, V = 0;\
for( int i = 0; i < 4; i++ )\
H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );\
for( int i = 0; i < 8; i++ )\
V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
/* Converts H/V into a, b, c and calls x264_predict_8x16c_p_core_<name>.
 * The two builds pass a different second argument: HIGH_BIT_DEPTH passes `a`
 * itself, while the 8-bit build passes i00 = a - 3*b - 7*c + 16. */
#if HIGH_BIT_DEPTH
#define PREDICT_8x16C_P_END(name)\
int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
int b = ( 17 * H + 16 ) >> 5;\
int c = ( 5 * V + 32 ) >> 6;\
x264_predict_8x16c_p_core_##name( src, a, b, c );
#else // !HIGH_BIT_DEPTH
#define PREDICT_8x16C_P_END(name)\
int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );\
int b = ( 17 * H + 16 ) >> 5;\
int c = ( 5 * V + 32 ) >> 6;\
int i00 = a -3*b -7*c + 16;\
x264_predict_8x16c_p_core_##name( src, i00, b, c );
#endif // HIGH_BIT_DEPTH
/* Defines static predict_8x16c_p_<name>( pixel *src ) from the two pieces
 * above; wrapper and core kernel share the same suffix. */
#define PREDICT_8x16C_P(name)\
static void predict_8x16c_p_##name( pixel *src )\
{\
PREDICT_8x16C_P_CORE\
PREDICT_8x16C_P_END(name)\
}
/* Instances: mmx2 only for 32-bit 8-bit-depth builds; sse2, avx and avx2
 * unconditionally. */
#if !ARCH_X86_64 && !HIGH_BIT_DEPTH
PREDICT_8x16C_P( mmx2 )
#endif // !ARCH_X86_64 && !HIGH_BIT_DEPTH
PREDICT_8x16C_P( sse2 )
PREDICT_8x16C_P( avx )
PREDICT_8x16C_P( avx2 )
/* C gradient sums for 8x8 chroma plane prediction: declares H and V and adds
 * the four PREDICT_P_SUM terms (weights 1..4) centred on column/row 3. */
#define PREDICT_8x8C_P_CORE\
int H = 0;\
int V = 0;\
PREDICT_P_SUM(3,1)\
PREDICT_P_SUM(3,2)\
PREDICT_P_SUM(3,3)\
PREDICT_P_SUM(3,4)
/* Converts H/V into a, b, c and calls x264_predict_8x8c_p_core_<name>.
 * HIGH_BIT_DEPTH passes `a` as the second argument; the 8-bit build passes
 * i00 = a - 3*b - 3*c + 16 instead. */
#if HIGH_BIT_DEPTH
#define PREDICT_8x8C_P_END(name)\
int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
int b = ( 17 * H + 16 ) >> 5;\
int c = ( 17 * V + 16 ) >> 5;\
x264_predict_8x8c_p_core_##name( src, a, b, c );
#else // !HIGH_BIT_DEPTH
#define PREDICT_8x8C_P_END(name)\
int a = 16 * ( src[7*FDEC_STRIDE -1] + src[7 - FDEC_STRIDE] );\
int b = ( 17 * H + 16 ) >> 5;\
int c = ( 17 * V + 16 ) >> 5;\
int i00 = a -3*b -3*c + 16;\
x264_predict_8x8c_p_core_##name( src, i00, b, c );
#endif // HIGH_BIT_DEPTH
/* Defines static predict_8x8c_p_<name>( pixel *src ): C gradient sums
 * followed by the <name2> core kernel. */
#define PREDICT_8x8C_P(name, name2)\
static void predict_8x8c_p_##name( pixel *src )\
{\
PREDICT_8x8C_P_CORE\
PREDICT_8x8C_P_END(name2)\
}
/* Inline-asm variant of the 8x8 chroma plane wrapper.  Without
 * HAVE_X86_INLINE_ASM, PREDICT_8x8C_P_INLINE is an alias for PREDICT_8x8C_P
 * (see the #else at the end of this section). */
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
/* Partial H for 16-bit pixels: the 8 pixels at columns 0..7 of the row above
 * (aligned load) are multiplied by pw_m32101234 with pmaddwd and the 32-bit
 * partial sums are added horizontally into H.  The column -1 term is not
 * covered here; PREDICT_8x8C_P_CORE_INLINE adds it in C. */
#define PREDICT_8x8C_P_ASM\
asm (\
"movdqa %1, %%xmm0 \n"\
"pmaddwd %2, %%xmm0 \n"\
"movhlps %%xmm0, %%xmm1 \n"\
"paddd %%xmm1, %%xmm0 \n"\
"pshuflw $14, %%xmm0, %%xmm1 \n"\
"paddd %%xmm1, %%xmm0 \n"\
"movd %%xmm0, %0 \n"\
:"=r"(H)\
:"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\
"m"(MEM_FIX(pw_m32101234, const int16_t, 8))\
:"xmm0", "xmm1"\
);
#else // !HIGH_BIT_DEPTH
/* Partial H for 8-bit pixels using MMX registers: pmaddubsw multiplies the 8
 * unsigned pixels at columns 0..7 by the signed bytes of pb_m32101234, the
 * 16-bit partial sums are added horizontally with pshufw/paddw, and movswl
 * sign-extends the low 16 bits into H.  The column -1 term is added in C by
 * PREDICT_8x8C_P_CORE_INLINE. */
#define PREDICT_8x8C_P_ASM\
asm (\
"movq %1, %%mm0 \n"\
"pmaddubsw %2, %%mm0 \n"\
"pshufw $14, %%mm0, %%mm1 \n"\
"paddw %%mm1, %%mm0 \n"\
"pshufw $1, %%mm0, %%mm1 \n"\
"paddw %%mm1, %%mm0 \n"\
"movd %%mm0, %0 \n"\
"movswl %w0, %0 \n"\
:"=r"(H)\
:"m"(MEM_FIX(&src[-FDEC_STRIDE], const pixel, 8)),\
"m"(MEM_FIX(pb_m32101234, const int8_t, 8))\
:"mm0", "mm1"\
);
#endif // HIGH_BIT_DEPTH
/* Declares H and V, takes the partial H from PREDICT_8x8C_P_ASM, computes V
 * in C term by term, and finally completes H with the -4 * (top-left pixel)
 * term that the 8-lane asm sum does not include. */
#define PREDICT_8x8C_P_CORE_INLINE\
int H, V;\
PREDICT_8x8C_P_ASM\
V = 1 * ( src[4*FDEC_STRIDE -1] - src[ 2*FDEC_STRIDE -1] )\
+ 2 * ( src[5*FDEC_STRIDE -1] - src[ 1*FDEC_STRIDE -1] )\
+ 3 * ( src[6*FDEC_STRIDE -1] - src[ 0*FDEC_STRIDE -1] )\
+ 4 * ( src[7*FDEC_STRIDE -1] - src[-1*FDEC_STRIDE -1] );\
H += -4 * src[-1*FDEC_STRIDE -1];
/* Defines static predict_8x8c_p_<name>() using the inline-asm sums and the
 * <name2> core kernel. */
#define PREDICT_8x8C_P_INLINE(name, name2)\
static void predict_8x8c_p_##name( pixel *src )\
{\
PREDICT_8x8C_P_CORE_INLINE\
PREDICT_8x8C_P_END(name2)\
}
#else // !HAVE_X86_INLINE_ASM
/* No inline asm: fall back to the pure-C wrapper. */
#define PREDICT_8x8C_P_INLINE(name, name2) PREDICT_8x8C_P(name, name2)
#endif // HAVE_X86_INLINE_ASM
/* Instantiate the static predict_8x8c_p_* wrappers.
 * HIGH_BIT_DEPTH: sse2.
 * 8-bit: mmx2 (32-bit builds only), sse2 with C sums, and ssse3 with
 * inline-asm sums on top of the sse2 core (only when inline asm is available).
 * avx and avx2 are generated for both bit depths. */
#if HIGH_BIT_DEPTH
PREDICT_8x8C_P_INLINE( sse2, sse2 )
#else //!HIGH_BIT_DEPTH
#if !ARCH_X86_64
PREDICT_8x8C_P( mmx2, mmx2 )
#endif // !ARCH_X86_64
PREDICT_8x8C_P( sse2, sse2 )
#if HAVE_X86_INLINE_ASM
PREDICT_8x8C_P_INLINE( ssse3, sse2 )
#endif // HAVE_X86_INLINE_ASM
#endif // HIGH_BIT_DEPTH
PREDICT_8x8C_P_INLINE( avx, avx )
PREDICT_8x8C_P_INLINE( avx2, avx2 )
#if ARCH_X86_64 && !HIGH_BIT_DEPTH
/* DC prediction of an 8x8 chroma block from its left neighbours only.
 * The four pixels left of rows 0..3 are averaged (rounded) and that byte is
 * written across rows 0..3; rows 4..7 are filled the same way from the four
 * pixels left of rows 4..7.  Each row is stored as one 64-bit write. */
static void predict_8x8c_dc_left( uint8_t *src )
{
    uint32_t upper_sum = 0;
    uint32_t lower_sum = 0;

    for( int row = 0; row < 4; row++ )
    {
        upper_sum += src[row * FDEC_STRIDE - 1];
        lower_sum += src[(row+4) * FDEC_STRIDE - 1];
    }

    /* Rounded mean of four pixels, replicated into all eight bytes. */
    const uint64_t upper_fill = (( upper_sum + 2 ) >> 2) * 0x0101010101010101ULL;
    const uint64_t lower_fill = (( lower_sum + 2 ) >> 2) * 0x0101010101010101ULL;

    uint8_t *dst = src;
    for( int row = 0; row < 8; row++, dst += FDEC_STRIDE )
        M64( dst ) = row < 4 ? upper_fill : lower_fill;
}
#endif // ARCH_X86_64 && !HIGH_BIT_DEPTH
/****************************************************************************
* Exported functions:
****************************************************************************/
/* Installs the x86 16x16 intra predictors that `cpu` supports into pf[].
 * Instruction-set tiers are applied from oldest to newest, so a later
 * assignment to a slot replaces an earlier one, and the function returns at
 * the first tier the CPU lacks.  Without MMX2 the table is left untouched. */
void x264_predict_16x16_init_mmx( uint32_t cpu, x264_predict_t pf[7] )
{
    if( (cpu & X264_CPU_MMX2) == 0 )
        return;
    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_mmx2;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_mmx2;

#if HIGH_BIT_DEPTH
    if( (cpu & X264_CPU_SSE) == 0 )
        return;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse;

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_16x16_P]       = predict_16x16_p_sse2;
    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_sse2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;
    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_avx;

    if( (cpu & X264_CPU_AVX2) == 0 )
        return;
    pf[I_PRED_16x16_H]       = x264_predict_16x16_h_avx2;
#else
#if !ARCH_X86_64
    pf[I_PRED_16x16_P]       = predict_16x16_p_mmx2;
#endif
    if( (cpu & X264_CPU_SSE) == 0 )
        return;
    pf[I_PRED_16x16_V]       = x264_predict_16x16_v_sse;

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_sse2;

    /* The remaining SSE2 (and newer) entries are skipped on slow-SSE2 CPUs. */
    if( (cpu & X264_CPU_SSE2_IS_SLOW) != 0 )
        return;
    pf[I_PRED_16x16_P]       = predict_16x16_p_sse2;
    pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_sse2;
    pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_sse2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    if( (cpu & X264_CPU_SLOW_PSHUFB) == 0 )
        pf[I_PRED_16x16_H]   = x264_predict_16x16_h_ssse3;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_16x16_P]       = predict_16x16_p_ssse3;
#endif
    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_16x16_P]       = predict_16x16_p_avx;
#endif // HIGH_BIT_DEPTH

    /* Shared by both bit depths; only reached when every earlier tier passed. */
    if( (cpu & X264_CPU_AVX2) != 0 )
    {
        pf[I_PRED_16x16_DC]      = x264_predict_16x16_dc_avx2;
        pf[I_PRED_16x16_DC_TOP]  = x264_predict_16x16_dc_top_avx2;
        pf[I_PRED_16x16_DC_LEFT] = x264_predict_16x16_dc_left_avx2;
        pf[I_PRED_16x16_P]       = predict_16x16_p_avx2;
    }
}
/* Installs the x86 8x8 chroma intra predictors that `cpu` supports into pf[].
 * Tiers run from oldest to newest instruction set; later assignments replace
 * earlier ones and the function returns at the first missing tier.  Without
 * MMX the table is left untouched. */
void x264_predict_8x8c_init_mmx( uint32_t cpu, x264_predict_t pf[7] )
{
    if( (cpu & X264_CPU_MMX) == 0 )
        return;

#if HIGH_BIT_DEPTH
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;

    if( (cpu & X264_CPU_MMX2) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_mmx2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmx2;

    if( (cpu & X264_CPU_SSE) == 0 )
        return;
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_sse;

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_sse2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_sse2;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_sse2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_sse2;

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_avx;

    if( (cpu & X264_CPU_AVX2) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_avx2;
#else
    /* Plain-MMX tier: the C dc_left helper exists only in 64-bit builds. */
#if ARCH_X86_64
    pf[I_PRED_CHROMA_DC_LEFT] = predict_8x8c_dc_left;
#endif
    pf[I_PRED_CHROMA_V]       = x264_predict_8x8c_v_mmx;

    if( (cpu & X264_CPU_MMX2) == 0 )
        return;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_mmx2;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_mmx2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_mmx2;
#if !ARCH_X86_64
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_mmx2;
#endif

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_sse2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x8c_h_ssse3;
#if HAVE_X86_INLINE_ASM
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_ssse3;
#endif

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x8c_p_avx;
#endif // HIGH_BIT_DEPTH

    /* Shared by both bit depths; only reached when every earlier tier passed. */
    if( (cpu & X264_CPU_AVX2) != 0 )
    {
        pf[I_PRED_CHROMA_P]   = predict_8x8c_p_avx2;
    }
}
/* Installs the x86 8x16 chroma intra predictors that `cpu` supports into
 * pf[].  Tiers run from oldest to newest instruction set; later assignments
 * replace earlier ones and the function returns at the first missing tier.
 * Without MMX the table is left untouched. */
void x264_predict_8x16c_init_mmx( uint32_t cpu, x264_predict_t pf[7] )
{
    if( (cpu & X264_CPU_MMX) == 0 )
        return;

#if HIGH_BIT_DEPTH
    if( (cpu & X264_CPU_MMX2) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_mmx2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_mmx2;

    if( (cpu & X264_CPU_SSE) == 0 )
        return;
    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_sse;

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x16c_p_sse2;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_sse2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_sse2;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_sse2;

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x16c_p_avx;

    if( (cpu & X264_CPU_AVX2) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_avx2;
#else
    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_mmx;

    if( (cpu & X264_CPU_MMX2) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_mmx2;
    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_mmx2;
    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_mmx2;
#if !ARCH_X86_64
    pf[I_PRED_CHROMA_P]       = predict_8x16c_p_mmx2;
#endif

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x16c_p_sse2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_ssse3;

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_CHROMA_P]       = predict_8x16c_p_avx;
#endif // HIGH_BIT_DEPTH

    /* Shared by both bit depths; only reached when every earlier tier passed. */
    if( (cpu & X264_CPU_AVX2) != 0 )
    {
        pf[I_PRED_CHROMA_P]   = predict_8x16c_p_avx2;
    }
}
/* Installs the x86 8x8 luma intra predictors that `cpu` supports into pf[]
 * and stores the matching edge-filter routine through predict_8x8_filter.
 * Tiers run from oldest to newest instruction set; later assignments replace
 * earlier ones and the function returns at the first missing tier.  Without
 * MMX2 nothing is written. */
void x264_predict_8x8_init_mmx( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
{
    if( (cpu & X264_CPU_MMX2) == 0 )
        return;

#if HIGH_BIT_DEPTH
    if( (cpu & X264_CPU_SSE) == 0 )
        return;
    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_sse;

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_sse2;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_sse2;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_sse2;
    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_sse2;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_sse2;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_sse2;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_sse2;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_sse2;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_sse2;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_sse2;
    *predict_8x8_filter   = x264_predict_8x8_filter_sse2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_ssse3;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_ssse3;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_ssse3;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_ssse3;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_ssse3;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_ssse3;
    *predict_8x8_filter   = x264_predict_8x8_filter_ssse3;
    /* The cache64 DDL/DDR variants replace the plain SSSE3 ones when the
     * CACHELINE_64 flag is set. */
    if( (cpu & X264_CPU_CACHELINE_64) != 0 )
    {
        pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_cache64_ssse3;
        pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_cache64_ssse3;
    }

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_avx;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_avx;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_avx;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_avx;
    *predict_8x8_filter   = x264_predict_8x8_filter_avx;
#else
    pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmx2;
    pf[I_PRED_8x8_DC_LEFT]= x264_predict_8x8_dc_left_mmx2;
    pf[I_PRED_8x8_DC_TOP] = x264_predict_8x8_dc_top_mmx2;
    pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmx2;
    pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmx2;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_mmx2;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_mmx2;
    *predict_8x8_filter   = x264_predict_8x8_filter_mmx2;
    /* These four MMX2 predictors are installed only when ARCH_X86 is set. */
#if ARCH_X86
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_mmx2;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_mmx2;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_mmx2;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_mmx2;
#endif

    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_sse2;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_sse2;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_sse2;
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_sse2;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_sse2;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_sse2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    /* The SSSE3 DDL/VR variants are skipped when SLOW_PALIGNR is flagged. */
    if( (cpu & X264_CPU_SLOW_PALIGNR) == 0 )
    {
        pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_ssse3;
        pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_ssse3;
    }
    pf[I_PRED_8x8_HU]     = x264_predict_8x8_hu_ssse3;
    *predict_8x8_filter   = x264_predict_8x8_filter_ssse3;

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_8x8_HD]     = x264_predict_8x8_hd_avx;
    pf[I_PRED_8x8_VL]     = x264_predict_8x8_vl_avx;
    pf[I_PRED_8x8_VR]     = x264_predict_8x8_vr_avx;
    pf[I_PRED_8x8_DDL]    = x264_predict_8x8_ddl_avx;
    pf[I_PRED_8x8_DDR]    = x264_predict_8x8_ddr_avx;
#endif // HIGH_BIT_DEPTH
}
/* Installs the x86 4x4 intra predictors that `cpu` supports into pf[].
 * Tiers run from oldest to newest instruction set; later assignments replace
 * earlier ones and the function returns at the first missing tier.  Without
 * MMX2 the table is left untouched. */
void x264_predict_4x4_init_mmx( uint32_t cpu, x264_predict_t pf[12] )
{
    if( (cpu & X264_CPU_MMX2) == 0 )
        return;
    pf[I_PRED_4x4_HU]  = x264_predict_4x4_hu_mmx2;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_mmx2;
    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmx2;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_mmx2;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmx2;
    pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_mmx2;

#if HIGH_BIT_DEPTH
    if( (cpu & X264_CPU_SSE2) == 0 )
        return;
    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_sse2;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_sse2;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_sse2;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_sse2;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_sse2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;

    if( (cpu & X264_CPU_AVX) == 0 )
        return;
    pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_avx;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_avx;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_avx;
    pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_avx;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_avx;

    if( (cpu & X264_CPU_AVX2) == 0 )
        return;
    pf[I_PRED_4x4_H]   = x264_predict_4x4_h_avx2;
#else
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmx2;

    if( (cpu & X264_CPU_SSSE3) == 0 )
        return;
    pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
    pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
    pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
    /* The cache64 VR variant replaces the plain SSSE3 one when the
     * CACHELINE_64 flag is set. */
    if( (cpu & X264_CPU_CACHELINE_64) != 0 )
        pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_cache64_ssse3;
#endif // HIGH_BIT_DEPTH
}

256
common/x86/predict.h Normal file
View File

@@ -0,0 +1,256 @@
/*****************************************************************************
* predict.h: x86 intra prediction
*****************************************************************************
* Copyright (C) 2003-2025 x264 project
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_PREDICT_H
#define X264_X86_PREDICT_H
/* Table initialisers implemented in predict-c.c.  Each takes the detected CPU
 * flag word and overwrites entries of the predictor table `pf` with the x86
 * implementations that the flags allow; the 8x8 variant additionally stores
 * an edge-filter routine through predict_8x8_filter. */
#define x264_predict_16x16_init_mmx x264_template(predict_16x16_init_mmx)
void x264_predict_16x16_init_mmx( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_8x16c_init_mmx x264_template(predict_8x16c_init_mmx)
void x264_predict_8x16c_init_mmx( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_8x8c_init_mmx x264_template(predict_8x8c_init_mmx)
void x264_predict_8x8c_init_mmx ( uint32_t cpu, x264_predict_t pf[7] );
#define x264_predict_4x4_init_mmx x264_template(predict_4x4_init_mmx)
void x264_predict_4x4_init_mmx ( uint32_t cpu, x264_predict_t pf[12] );
#define x264_predict_8x8_init_mmx x264_template(predict_8x8_init_mmx)
void x264_predict_8x8_init_mmx ( uint32_t cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter );
/* 16x16 predictor entry points installed by x264_predict_16x16_init_mmx().
 * Prototypes written with uint16_t* (v_avx, h_sse2, h_avx2) are installed
 * only in the HIGH_BIT_DEPTH branch of that function, and the uint8_t* one
 * (h_ssse3) only in the 8-bit branch; `pixel` prototypes serve both. */
#define x264_predict_16x16_v_mmx2 x264_template(predict_16x16_v_mmx2)
void x264_predict_16x16_v_mmx2( pixel *src );
#define x264_predict_16x16_v_sse x264_template(predict_16x16_v_sse)
void x264_predict_16x16_v_sse ( pixel *src );
#define x264_predict_16x16_v_avx x264_template(predict_16x16_v_avx)
void x264_predict_16x16_v_avx ( uint16_t *src );
#define x264_predict_16x16_h_mmx2 x264_template(predict_16x16_h_mmx2)
void x264_predict_16x16_h_mmx2( pixel *src );
#define x264_predict_16x16_h_sse2 x264_template(predict_16x16_h_sse2)
void x264_predict_16x16_h_sse2( uint16_t *src );
#define x264_predict_16x16_h_ssse3 x264_template(predict_16x16_h_ssse3)
void x264_predict_16x16_h_ssse3( uint8_t *src );
#define x264_predict_16x16_h_avx2 x264_template(predict_16x16_h_avx2)
void x264_predict_16x16_h_avx2( uint16_t *src );
#define x264_predict_16x16_dc_sse2 x264_template(predict_16x16_dc_sse2)
void x264_predict_16x16_dc_sse2( pixel *src );
#define x264_predict_16x16_dc_avx2 x264_template(predict_16x16_dc_avx2)
void x264_predict_16x16_dc_avx2( pixel *src );
#define x264_predict_16x16_dc_left_sse2 x264_template(predict_16x16_dc_left_sse2)
void x264_predict_16x16_dc_left_sse2( pixel *src );
#define x264_predict_16x16_dc_left_avx2 x264_template(predict_16x16_dc_left_avx2)
void x264_predict_16x16_dc_left_avx2( pixel *src );
#define x264_predict_16x16_dc_top_sse2 x264_template(predict_16x16_dc_top_sse2)
void x264_predict_16x16_dc_top_sse2( pixel *src );
#define x264_predict_16x16_dc_top_avx2 x264_template(predict_16x16_dc_top_avx2)
void x264_predict_16x16_dc_top_avx2( pixel *src );
/* Plane-prediction core kernels.  They are not installed in the table
 * directly: the static predict_16x16_p_* wrappers in predict-c.c compute
 * i00, b and c from the neighbouring pixels and then call these. */
#define x264_predict_16x16_p_core_mmx2 x264_template(predict_16x16_p_core_mmx2)
void x264_predict_16x16_p_core_mmx2( uint8_t *src, int i00, int b, int c );
#define x264_predict_16x16_p_core_sse2 x264_template(predict_16x16_p_core_sse2)
void x264_predict_16x16_p_core_sse2( pixel *src, int i00, int b, int c );
#define x264_predict_16x16_p_core_avx x264_template(predict_16x16_p_core_avx)
void x264_predict_16x16_p_core_avx( pixel *src, int i00, int b, int c );
#define x264_predict_16x16_p_core_avx2 x264_template(predict_16x16_p_core_avx2)
void x264_predict_16x16_p_core_avx2( pixel *src, int i00, int b, int c );
/* 8x16 chroma predictor entry points installed by
 * x264_predict_8x16c_init_mmx().  Prototypes written with uint16_t* are
 * installed only in the HIGH_BIT_DEPTH branch of that function, uint8_t* ones
 * only in the 8-bit branch; `pixel` prototypes serve both. */
#define x264_predict_8x16c_dc_mmx2 x264_template(predict_8x16c_dc_mmx2)
void x264_predict_8x16c_dc_mmx2( pixel *src );
#define x264_predict_8x16c_dc_sse2 x264_template(predict_8x16c_dc_sse2)
void x264_predict_8x16c_dc_sse2( uint16_t *src );
#define x264_predict_8x16c_dc_top_mmx2 x264_template(predict_8x16c_dc_top_mmx2)
void x264_predict_8x16c_dc_top_mmx2( uint8_t *src );
#define x264_predict_8x16c_dc_top_sse2 x264_template(predict_8x16c_dc_top_sse2)
void x264_predict_8x16c_dc_top_sse2( uint16_t *src );
#define x264_predict_8x16c_v_mmx x264_template(predict_8x16c_v_mmx)
void x264_predict_8x16c_v_mmx( uint8_t *src );
#define x264_predict_8x16c_v_sse x264_template(predict_8x16c_v_sse)
void x264_predict_8x16c_v_sse( uint16_t *src );
#define x264_predict_8x16c_h_mmx2 x264_template(predict_8x16c_h_mmx2)
void x264_predict_8x16c_h_mmx2( pixel *src );
#define x264_predict_8x16c_h_sse2 x264_template(predict_8x16c_h_sse2)
void x264_predict_8x16c_h_sse2( uint16_t *src );
#define x264_predict_8x16c_h_ssse3 x264_template(predict_8x16c_h_ssse3)
void x264_predict_8x16c_h_ssse3( uint8_t *src );
#define x264_predict_8x16c_h_avx2 x264_template(predict_8x16c_h_avx2)
void x264_predict_8x16c_h_avx2( uint16_t *src );
/* Plane-prediction core kernels, called from the predict_8x16c_p_* wrappers
 * in predict-c.c.  NOTE: the second parameter is named i00 here, but the
 * HIGH_BIT_DEPTH wrapper passes the unadjusted `a` value in that position;
 * only the 8-bit wrapper passes a - 3*b - 7*c + 16. */
#define x264_predict_8x16c_p_core_mmx2 x264_template(predict_8x16c_p_core_mmx2)
void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
#define x264_predict_8x16c_p_core_sse2 x264_template(predict_8x16c_p_core_sse2)
void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
#define x264_predict_8x16c_p_core_avx x264_template(predict_8x16c_p_core_avx)
void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c );
#define x264_predict_8x16c_p_core_avx2 x264_template(predict_8x16c_p_core_avx2)
void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c );
/* 8x8 chroma plane-prediction core kernels, called from the predict_8x8c_p_*
 * wrappers in predict-c.c.  NOTE: the second parameter is named i00 here, but
 * the HIGH_BIT_DEPTH wrapper passes the unadjusted `a` value in that position;
 * only the 8-bit wrapper passes a - 3*b - 3*c + 16. */
#define x264_predict_8x8c_p_core_mmx2 x264_template(predict_8x8c_p_core_mmx2)
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
#define x264_predict_8x8c_p_core_sse2 x264_template(predict_8x8c_p_core_sse2)
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
#define x264_predict_8x8c_p_core_avx x264_template(predict_8x8c_p_core_avx)
void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c );
#define x264_predict_8x8c_p_core_avx2 x264_template(predict_8x8c_p_core_avx2)
void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c );
/* 8x8 chroma predictor entry points installed by
 * x264_predict_8x8c_init_mmx().  uint16_t* prototypes are installed only in
 * its HIGH_BIT_DEPTH branch, uint8_t* ones only in its 8-bit branch. */
#define x264_predict_8x8c_dc_mmx2 x264_template(predict_8x8c_dc_mmx2)
void x264_predict_8x8c_dc_mmx2( pixel *src );
#define x264_predict_8x8c_dc_sse2 x264_template(predict_8x8c_dc_sse2)
void x264_predict_8x8c_dc_sse2( uint16_t *src );
#define x264_predict_8x8c_dc_top_mmx2 x264_template(predict_8x8c_dc_top_mmx2)
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
#define x264_predict_8x8c_dc_top_sse2 x264_template(predict_8x8c_dc_top_sse2)
void x264_predict_8x8c_dc_top_sse2( uint16_t *src );
#define x264_predict_8x8c_v_mmx x264_template(predict_8x8c_v_mmx)
void x264_predict_8x8c_v_mmx( pixel *src );
#define x264_predict_8x8c_v_sse x264_template(predict_8x8c_v_sse)
void x264_predict_8x8c_v_sse( uint16_t *src );
#define x264_predict_8x8c_h_mmx2 x264_template(predict_8x8c_h_mmx2)
void x264_predict_8x8c_h_mmx2( pixel *src );
#define x264_predict_8x8c_h_sse2 x264_template(predict_8x8c_h_sse2)
void x264_predict_8x8c_h_sse2( uint16_t *src );
#define x264_predict_8x8c_h_ssse3 x264_template(predict_8x8c_h_ssse3)
void x264_predict_8x8c_h_ssse3( uint8_t *src );
#define x264_predict_8x8c_h_avx2 x264_template(predict_8x8c_h_avx2)
void x264_predict_8x8c_h_avx2( uint16_t *src );
/* 8x8 luma predictors take a second argument, a 36-element edge array, in
 * addition to src.  NOTE(review): presumably this is the neighbour buffer
 * produced by the predict_8x8_filter routine -- confirm against the caller. */
#define x264_predict_8x8_v_mmx2 x264_template(predict_8x8_v_mmx2)
void x264_predict_8x8_v_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_v_sse x264_template(predict_8x8_v_sse)
void x264_predict_8x8_v_sse ( uint16_t *src, uint16_t edge[36] );
#define x264_predict_8x8_h_mmx2 x264_template(predict_8x8_h_mmx2)
void x264_predict_8x8_h_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_h_sse2 x264_template(predict_8x8_h_sse2)
void x264_predict_8x8_h_sse2( uint16_t *src, uint16_t edge[36] );
#define x264_predict_8x8_hd_mmx2 x264_template(predict_8x8_hd_mmx2)
void x264_predict_8x8_hd_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_hu_mmx2 x264_template(predict_8x8_hu_mmx2)
void x264_predict_8x8_hu_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_dc_mmx2 x264_template(predict_8x8_dc_mmx2)
void x264_predict_8x8_dc_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_dc_sse2 x264_template(predict_8x8_dc_sse2)
void x264_predict_8x8_dc_sse2( uint16_t *src, uint16_t edge[36] );
#define x264_predict_8x8_dc_top_mmx2 x264_template(predict_8x8_dc_top_mmx2)
void x264_predict_8x8_dc_top_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_dc_top_sse2 x264_template(predict_8x8_dc_top_sse2)
void x264_predict_8x8_dc_top_sse2( uint16_t *src, uint16_t edge[36] );
#define x264_predict_8x8_dc_left_mmx2 x264_template(predict_8x8_dc_left_mmx2)
void x264_predict_8x8_dc_left_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_dc_left_sse2 x264_template(predict_8x8_dc_left_sse2)
void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
#define x264_predict_8x8_ddl_mmx2 x264_template(predict_8x8_ddl_mmx2)
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddl_sse2 x264_template(predict_8x8_ddl_sse2)
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddl_ssse3 x264_template(predict_8x8_ddl_ssse3)
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddl_cache64_ssse3 x264_template(predict_8x8_ddl_cache64_ssse3)
void x264_predict_8x8_ddl_cache64_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddl_avx x264_template(predict_8x8_ddl_avx)
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddr_mmx2 x264_template(predict_8x8_ddr_mmx2)
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_ddr_sse2 x264_template(predict_8x8_ddr_sse2)
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddr_ssse3 x264_template(predict_8x8_ddr_ssse3)
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddr_cache64_ssse3 x264_template(predict_8x8_ddr_cache64_ssse3)
void x264_predict_8x8_ddr_cache64_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_ddr_avx x264_template(predict_8x8_ddr_avx)
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vl_sse2 x264_template(predict_8x8_vl_sse2)
void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vl_ssse3 x264_template(predict_8x8_vl_ssse3)
void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vl_avx x264_template(predict_8x8_vl_avx)
void x264_predict_8x8_vl_avx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vl_mmx2 x264_template(predict_8x8_vl_mmx2)
void x264_predict_8x8_vl_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vr_mmx2 x264_template(predict_8x8_vr_mmx2)
void x264_predict_8x8_vr_mmx2( uint8_t *src, uint8_t edge[36] );
#define x264_predict_8x8_vr_sse2 x264_template(predict_8x8_vr_sse2)
void x264_predict_8x8_vr_sse2( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vr_ssse3 x264_template(predict_8x8_vr_ssse3)
void x264_predict_8x8_vr_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_vr_avx x264_template(predict_8x8_vr_avx)
void x264_predict_8x8_vr_avx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_hu_sse2 x264_template(predict_8x8_hu_sse2)
void x264_predict_8x8_hu_sse2( pixel *src, pixel edge[36] );
#define x264_predict_8x8_hu_ssse3 x264_template(predict_8x8_hu_ssse3)
void x264_predict_8x8_hu_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_hu_avx x264_template(predict_8x8_hu_avx)
void x264_predict_8x8_hu_avx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_hd_sse2 x264_template(predict_8x8_hd_sse2)
void x264_predict_8x8_hd_sse2( pixel *src, pixel edge[36] );
#define x264_predict_8x8_hd_ssse3 x264_template(predict_8x8_hd_ssse3)
void x264_predict_8x8_hd_ssse3( pixel *src, pixel edge[36] );
#define x264_predict_8x8_hd_avx x264_template(predict_8x8_hd_avx)
void x264_predict_8x8_hd_avx( pixel *src, pixel edge[36] );
#define x264_predict_8x8_filter_mmx2 x264_template(predict_8x8_filter_mmx2)
void x264_predict_8x8_filter_mmx2( uint8_t *src, uint8_t edge[36], int i_neighbor, int i_filters );
#define x264_predict_8x8_filter_sse2 x264_template(predict_8x8_filter_sse2)
void x264_predict_8x8_filter_sse2( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
#define x264_predict_8x8_filter_ssse3 x264_template(predict_8x8_filter_ssse3)
void x264_predict_8x8_filter_ssse3( pixel *src, pixel edge[36], int i_neighbor, int i_filters );
#define x264_predict_8x8_filter_avx x264_template(predict_8x8_filter_avx)
void x264_predict_8x8_filter_avx( uint16_t *src, uint16_t edge[36], int i_neighbor, int i_filters );
#define x264_predict_4x4_h_avx2 x264_template(predict_4x4_h_avx2)
void x264_predict_4x4_h_avx2( uint16_t *src );
#define x264_predict_4x4_ddl_mmx2 x264_template(predict_4x4_ddl_mmx2)
void x264_predict_4x4_ddl_mmx2( pixel *src );
#define x264_predict_4x4_ddl_sse2 x264_template(predict_4x4_ddl_sse2)
void x264_predict_4x4_ddl_sse2( uint16_t *src );
#define x264_predict_4x4_ddl_avx x264_template(predict_4x4_ddl_avx)
void x264_predict_4x4_ddl_avx( uint16_t *src );
#define x264_predict_4x4_ddr_mmx2 x264_template(predict_4x4_ddr_mmx2)
void x264_predict_4x4_ddr_mmx2( pixel *src );
#define x264_predict_4x4_vl_mmx2 x264_template(predict_4x4_vl_mmx2)
void x264_predict_4x4_vl_mmx2( pixel *src );
#define x264_predict_4x4_vl_sse2 x264_template(predict_4x4_vl_sse2)
void x264_predict_4x4_vl_sse2( uint16_t *src );
#define x264_predict_4x4_vl_avx x264_template(predict_4x4_vl_avx)
void x264_predict_4x4_vl_avx( uint16_t *src );
#define x264_predict_4x4_vr_mmx2 x264_template(predict_4x4_vr_mmx2)
void x264_predict_4x4_vr_mmx2( uint8_t *src );
#define x264_predict_4x4_vr_sse2 x264_template(predict_4x4_vr_sse2)
void x264_predict_4x4_vr_sse2( uint16_t *src );
#define x264_predict_4x4_vr_ssse3 x264_template(predict_4x4_vr_ssse3)
void x264_predict_4x4_vr_ssse3( pixel *src );
#define x264_predict_4x4_vr_cache64_ssse3 x264_template(predict_4x4_vr_cache64_ssse3)
void x264_predict_4x4_vr_cache64_ssse3( uint8_t *src );
#define x264_predict_4x4_vr_avx x264_template(predict_4x4_vr_avx)
void x264_predict_4x4_vr_avx( uint16_t *src );
#define x264_predict_4x4_hd_mmx2 x264_template(predict_4x4_hd_mmx2)
void x264_predict_4x4_hd_mmx2( pixel *src );
#define x264_predict_4x4_hd_sse2 x264_template(predict_4x4_hd_sse2)
void x264_predict_4x4_hd_sse2( uint16_t *src );
#define x264_predict_4x4_hd_ssse3 x264_template(predict_4x4_hd_ssse3)
void x264_predict_4x4_hd_ssse3( pixel *src );
#define x264_predict_4x4_hd_avx x264_template(predict_4x4_hd_avx)
void x264_predict_4x4_hd_avx( uint16_t *src );
#define x264_predict_4x4_dc_mmx2 x264_template(predict_4x4_dc_mmx2)
void x264_predict_4x4_dc_mmx2( pixel *src );
#define x264_predict_4x4_ddr_sse2 x264_template(predict_4x4_ddr_sse2)
void x264_predict_4x4_ddr_sse2( uint16_t *src );
#define x264_predict_4x4_ddr_ssse3 x264_template(predict_4x4_ddr_ssse3)
void x264_predict_4x4_ddr_ssse3( pixel *src );
#define x264_predict_4x4_ddr_avx x264_template(predict_4x4_ddr_avx)
void x264_predict_4x4_ddr_avx( uint16_t *src );
#define x264_predict_4x4_hu_mmx2 x264_template(predict_4x4_hu_mmx2)
void x264_predict_4x4_hu_mmx2( pixel *src );
#endif

2269
common/x86/quant-a.asm Normal file

File diff suppressed because it is too large Load Diff

278
common/x86/quant.h Normal file
View File

@@ -0,0 +1,278 @@
/*****************************************************************************
* quant.h: x86 quantization and level-run
*****************************************************************************
* Copyright (C) 2005-2025 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
* Christian Heine <sennindemokrit@gmx.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_QUANT_H
#define X264_X86_QUANT_H
/* Quantization entry points, grouped by instruction-set suffix.
 * Each public name is first routed through x264_template() (defined outside
 * this header) and then declared.  All functions return int.
 *   - *_dc variants take scalar mf/bias values;
 *   - the other variants take per-coefficient udctcoef mf[]/bias[] tables.
 * The array bound in each dct parameter documents how many coefficients the
 * routine is declared to operate on: 4 for 2x2, 16 for 4x4, 64 for 8x8 and
 * 4 blocks of 16 for 4x4x4.  The 2x2 DC prototypes therefore all use dct[4]. */

/* mmx2 */
#define x264_quant_2x2_dc_mmx2 x264_template(quant_2x2_dc_mmx2)
int x264_quant_2x2_dc_mmx2( dctcoef dct[4], int mf, int bias );
#define x264_quant_4x4_dc_mmx2 x264_template(quant_4x4_dc_mmx2)
int x264_quant_4x4_dc_mmx2( dctcoef dct[16], int mf, int bias );
#define x264_quant_4x4_mmx2 x264_template(quant_4x4_mmx2)
int x264_quant_4x4_mmx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_8x8_mmx2 x264_template(quant_8x8_mmx2)
int x264_quant_8x8_mmx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
/* sse2 */
#define x264_quant_2x2_dc_sse2 x264_template(quant_2x2_dc_sse2)
int x264_quant_2x2_dc_sse2( dctcoef dct[4], int mf, int bias );
#define x264_quant_4x4_dc_sse2 x264_template(quant_4x4_dc_sse2)
int x264_quant_4x4_dc_sse2( dctcoef dct[16], int mf, int bias );
#define x264_quant_4x4_sse2 x264_template(quant_4x4_sse2)
int x264_quant_4x4_sse2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_4x4x4_sse2 x264_template(quant_4x4x4_sse2)
int x264_quant_4x4x4_sse2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_8x8_sse2 x264_template(quant_8x8_sse2)
int x264_quant_8x8_sse2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
/* ssse3 */
#define x264_quant_2x2_dc_ssse3 x264_template(quant_2x2_dc_ssse3)
int x264_quant_2x2_dc_ssse3( dctcoef dct[4], int mf, int bias );
#define x264_quant_4x4_dc_ssse3 x264_template(quant_4x4_dc_ssse3)
int x264_quant_4x4_dc_ssse3( dctcoef dct[16], int mf, int bias );
#define x264_quant_4x4_ssse3 x264_template(quant_4x4_ssse3)
int x264_quant_4x4_ssse3( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_4x4x4_ssse3 x264_template(quant_4x4x4_ssse3)
int x264_quant_4x4x4_ssse3( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_8x8_ssse3 x264_template(quant_8x8_ssse3)
int x264_quant_8x8_ssse3( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
/* sse4 */
#define x264_quant_2x2_dc_sse4 x264_template(quant_2x2_dc_sse4)
int x264_quant_2x2_dc_sse4( dctcoef dct[4], int mf, int bias );
#define x264_quant_4x4_dc_sse4 x264_template(quant_4x4_dc_sse4)
int x264_quant_4x4_dc_sse4( dctcoef dct[16], int mf, int bias );
#define x264_quant_4x4_sse4 x264_template(quant_4x4_sse4)
int x264_quant_4x4_sse4( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_4x4x4_sse4 x264_template(quant_4x4x4_sse4)
int x264_quant_4x4x4_sse4( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_8x8_sse4 x264_template(quant_8x8_sse4)
int x264_quant_8x8_sse4( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
/* avx2 (no 2x2 DC variant is declared for this suffix) */
#define x264_quant_4x4_avx2 x264_template(quant_4x4_avx2)
int x264_quant_4x4_avx2( dctcoef dct[16], udctcoef mf[16], udctcoef bias[16] );
#define x264_quant_4x4_dc_avx2 x264_template(quant_4x4_dc_avx2)
int x264_quant_4x4_dc_avx2( dctcoef dct[16], int mf, int bias );
#define x264_quant_8x8_avx2 x264_template(quant_8x8_avx2)
int x264_quant_8x8_avx2( dctcoef dct[64], udctcoef mf[64], udctcoef bias[64] );
#define x264_quant_4x4x4_avx2 x264_template(quant_4x4x4_avx2)
int x264_quant_4x4x4_avx2( dctcoef dct[4][16], udctcoef mf[16], udctcoef bias[16] );
/* Dequantization entry points.  Every prototype returns void and takes the
 * coefficient array, a dequant_mf table declared as int [6][N] (N = 16 for the
 * 4x4 and 4x4dc variants, 64 for 8x8) and the integer i_qp.
 * The mmx/mmx2 variants are declared on int16_t coefficients; every later
 * suffix is declared on dctcoef.  The avx512 suffix declares only the 4x4 and
 * 8x8 sizes (no 4x4dc). */
#define x264_dequant_4x4_mmx x264_template(dequant_4x4_mmx)
void x264_dequant_4x4_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4dc_mmx2 x264_template(dequant_4x4dc_mmx2)
void x264_dequant_4x4dc_mmx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_mmx x264_template(dequant_8x8_mmx)
void x264_dequant_8x8_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* sse2 */
#define x264_dequant_4x4_sse2 x264_template(dequant_4x4_sse2)
void x264_dequant_4x4_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4dc_sse2 x264_template(dequant_4x4dc_sse2)
void x264_dequant_4x4dc_sse2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_sse2 x264_template(dequant_8x8_sse2)
void x264_dequant_8x8_sse2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
/* avx */
#define x264_dequant_4x4_avx x264_template(dequant_4x4_avx)
void x264_dequant_4x4_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4dc_avx x264_template(dequant_4x4dc_avx)
void x264_dequant_4x4dc_avx( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_avx x264_template(dequant_8x8_avx)
void x264_dequant_8x8_avx( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
/* xop */
#define x264_dequant_4x4_xop x264_template(dequant_4x4_xop)
void x264_dequant_4x4_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4dc_xop x264_template(dequant_4x4dc_xop)
void x264_dequant_4x4dc_xop( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_xop x264_template(dequant_8x8_xop)
void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
/* avx2 */
#define x264_dequant_4x4_avx2 x264_template(dequant_4x4_avx2)
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_4x4dc_avx2 x264_template(dequant_4x4dc_avx2)
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_avx2 x264_template(dequant_8x8_avx2)
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
/* avx512 */
#define x264_dequant_4x4_avx512 x264_template(dequant_4x4_avx512)
void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_avx512 x264_template(dequant_8x8_avx512)
void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
/* "flat16" dequantization entry points.  They share the (dct, dequant_mf,
 * i_qp) parameter list of the plain dequant prototypes, but every variant here
 * is declared on int16_t coefficients regardless of suffix.
 * NOTE(review): the name suggests a flat quantization matrix with 16-bit
 * arithmetic; this header does not state it -- confirm against the asm.
 * The avx512 suffix declares only the 8x8 size. */
#define x264_dequant_4x4_flat16_mmx x264_template(dequant_4x4_flat16_mmx)
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_flat16_mmx x264_template(dequant_8x8_flat16_mmx)
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_4x4_flat16_sse2 x264_template(dequant_4x4_flat16_sse2)
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_flat16_sse2 x264_template(dequant_8x8_flat16_sse2)
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_4x4_flat16_avx2 x264_template(dequant_4x4_flat16_avx2)
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
#define x264_dequant_8x8_flat16_avx2 x264_template(dequant_8x8_flat16_avx2)
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
#define x264_dequant_8x8_flat16_avx512 x264_template(dequant_8x8_flat16_avx512)
void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp );
/* 2x4 DC inverse-transform + dequant entry points (sse2 and avx).
 *   *_dc     : takes the 8 DC coefficients, a dct4x4[8][16] array, the
 *              dequant_mf[6][16] table and i_qp.
 *   *_dconly : same, without the dct4x4 array.
 * NOTE(review): presumably *_dc scatters the results into dct4x4 while
 * *_dconly leaves them in dct -- not visible in this header, confirm. */
#define x264_idct_dequant_2x4_dc_sse2 x264_template(idct_dequant_2x4_dc_sse2)
void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
#define x264_idct_dequant_2x4_dc_avx x264_template(idct_dequant_2x4_dc_avx)
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
#define x264_idct_dequant_2x4_dconly_sse2 x264_template(idct_dequant_2x4_dconly_sse2)
void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
#define x264_idct_dequant_2x4_dconly_avx x264_template(idct_dequant_2x4_dconly_avx)
void x264_idct_dequant_2x4_dconly_avx ( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
/* optimize_chroma_2x2_dc: takes 4 coefficients and a single scalar dequant_mf
 * value (not a table) and returns int; the meaning of the return value is not
 * stated in this header. */
#define x264_optimize_chroma_2x2_dc_sse2 x264_template(optimize_chroma_2x2_dc_sse2)
int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
#define x264_optimize_chroma_2x2_dc_ssse3 x264_template(optimize_chroma_2x2_dc_ssse3)
int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
#define x264_optimize_chroma_2x2_dc_sse4 x264_template(optimize_chroma_2x2_dc_sse4)
int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
#define x264_optimize_chroma_2x2_dc_avx x264_template(optimize_chroma_2x2_dc_avx)
int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf );
/* denoise_dct entry points.  All variants share one signature: a coefficient
 * pointer, a uint32_t sum array, a udctcoef offset array and an int size.
 * Unlike the quant/dequant prototypes the block size is passed at run time
 * (size) instead of being encoded in the function name.
 * NOTE(review): presumably size is the coefficient count and sum is an
 * accumulator updated in place -- confirm against the asm implementation. */
#define x264_denoise_dct_mmx x264_template(denoise_dct_mmx)
void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
#define x264_denoise_dct_sse2 x264_template(denoise_dct_sse2)
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
#define x264_denoise_dct_ssse3 x264_template(denoise_dct_ssse3)
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
#define x264_denoise_dct_avx x264_template(denoise_dct_avx)
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
#define x264_denoise_dct_avx2 x264_template(denoise_dct_avx2)
void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
/* decimate_score entry points: each takes a coefficient pointer and returns an
 * int score.  The number in the name (15, 16, 64) is part of the function
 * name only; no array bound is declared on the parameter.
 * NOTE(review): presumably the number is the coefficient count examined --
 * confirm against the asm implementation. */
#define x264_decimate_score15_sse2 x264_template(decimate_score15_sse2)
int x264_decimate_score15_sse2( dctcoef *dct );
#define x264_decimate_score15_ssse3 x264_template(decimate_score15_ssse3)
int x264_decimate_score15_ssse3( dctcoef *dct );
#define x264_decimate_score15_avx512 x264_template(decimate_score15_avx512)
int x264_decimate_score15_avx512( dctcoef *dct );
#define x264_decimate_score16_sse2 x264_template(decimate_score16_sse2)
int x264_decimate_score16_sse2( dctcoef *dct );
#define x264_decimate_score16_ssse3 x264_template(decimate_score16_ssse3)
int x264_decimate_score16_ssse3( dctcoef *dct );
#define x264_decimate_score16_avx512 x264_template(decimate_score16_avx512)
int x264_decimate_score16_avx512( dctcoef *dct );
#define x264_decimate_score64_sse2 x264_template(decimate_score64_sse2)
int x264_decimate_score64_sse2( dctcoef *dct );
#define x264_decimate_score64_ssse3 x264_template(decimate_score64_ssse3)
int x264_decimate_score64_ssse3( dctcoef *dct );
/* NOTE(review): the avx2 variant is the only decimate_score prototype declared
 * on int16_t instead of dctcoef; presumably it is only built where dctcoef is
 * int16_t -- confirm before calling it through a dctcoef-typed pointer. */
#define x264_decimate_score64_avx2 x264_template(decimate_score64_avx2)
int x264_decimate_score64_avx2( int16_t *dct );
#define x264_decimate_score64_avx512 x264_template(decimate_score64_avx512)
int x264_decimate_score64_avx512( dctcoef *dct );
/* coeff_last entry points: each takes a coefficient pointer and returns int.
 * The number in the name (4, 8, 15, 16, 64) is part of the function name
 * only; no array bound is declared on the parameter.
 * NOTE(review): presumably the return value is the index of the last non-zero
 * coefficient among that many entries -- confirm against the asm.
 * Not every size exists for every suffix: sse2 has no coeff_last4, and avx2
 * declares only coeff_last64. */
/* mmx2 */
#define x264_coeff_last4_mmx2 x264_template(coeff_last4_mmx2)
int x264_coeff_last4_mmx2( dctcoef *dct );
#define x264_coeff_last8_mmx2 x264_template(coeff_last8_mmx2)
int x264_coeff_last8_mmx2( dctcoef *dct );
#define x264_coeff_last15_mmx2 x264_template(coeff_last15_mmx2)
int x264_coeff_last15_mmx2( dctcoef *dct );
#define x264_coeff_last16_mmx2 x264_template(coeff_last16_mmx2)
int x264_coeff_last16_mmx2( dctcoef *dct );
#define x264_coeff_last64_mmx2 x264_template(coeff_last64_mmx2)
int x264_coeff_last64_mmx2( dctcoef *dct );
/* sse2 */
#define x264_coeff_last8_sse2 x264_template(coeff_last8_sse2)
int x264_coeff_last8_sse2( dctcoef *dct );
#define x264_coeff_last15_sse2 x264_template(coeff_last15_sse2)
int x264_coeff_last15_sse2( dctcoef *dct );
#define x264_coeff_last16_sse2 x264_template(coeff_last16_sse2)
int x264_coeff_last16_sse2( dctcoef *dct );
#define x264_coeff_last64_sse2 x264_template(coeff_last64_sse2)
int x264_coeff_last64_sse2( dctcoef *dct );
/* lzcnt */
#define x264_coeff_last4_lzcnt x264_template(coeff_last4_lzcnt)
int x264_coeff_last4_lzcnt( dctcoef *dct );
#define x264_coeff_last8_lzcnt x264_template(coeff_last8_lzcnt)
int x264_coeff_last8_lzcnt( dctcoef *dct );
#define x264_coeff_last15_lzcnt x264_template(coeff_last15_lzcnt)
int x264_coeff_last15_lzcnt( dctcoef *dct );
#define x264_coeff_last16_lzcnt x264_template(coeff_last16_lzcnt)
int x264_coeff_last16_lzcnt( dctcoef *dct );
#define x264_coeff_last64_lzcnt x264_template(coeff_last64_lzcnt)
int x264_coeff_last64_lzcnt( dctcoef *dct );
/* avx2 */
#define x264_coeff_last64_avx2 x264_template(coeff_last64_avx2)
int x264_coeff_last64_avx2 ( dctcoef *dct );
/* avx512 */
/* NOTE(review): coeff_last4_avx512 is the only coeff_last prototype declared
 * on int32_t instead of dctcoef; presumably it is only built where dctcoef is
 * int32_t -- confirm before calling it through a dctcoef-typed pointer. */
#define x264_coeff_last4_avx512 x264_template(coeff_last4_avx512)
int x264_coeff_last4_avx512( int32_t *dct );
#define x264_coeff_last8_avx512 x264_template(coeff_last8_avx512)
int x264_coeff_last8_avx512( dctcoef *dct );
#define x264_coeff_last15_avx512 x264_template(coeff_last15_avx512)
int x264_coeff_last15_avx512( dctcoef *dct );
#define x264_coeff_last16_avx512 x264_template(coeff_last16_avx512)
int x264_coeff_last16_avx512( dctcoef *dct );
#define x264_coeff_last64_avx512 x264_template(coeff_last64_avx512)
int x264_coeff_last64_avx512( dctcoef *dct );
/* coeff_level_run entry points: each takes a coefficient pointer plus a
 * pointer to an x264_run_level_t (type declared outside this header) and
 * returns int.  The number in the name (16, 15, 4, 8) is part of the function
 * name only.
 * NOTE(review): presumably the routine fills *runlevel and returns the number
 * of non-zero coefficients -- not stated here, confirm against the asm.
 * Suffix coverage differs per size: only the 16 and 15 sizes declare an avx2
 * variant, and the 4 size declares no sse2 variant. */
/* 16 coefficients */
#define x264_coeff_level_run16_mmx2 x264_template(coeff_level_run16_mmx2)
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run16_sse2 x264_template(coeff_level_run16_sse2)
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run16_lzcnt x264_template(coeff_level_run16_lzcnt)
int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run16_ssse3 x264_template(coeff_level_run16_ssse3)
int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run16_ssse3_lzcnt x264_template(coeff_level_run16_ssse3_lzcnt)
int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run16_avx2 x264_template(coeff_level_run16_avx2)
int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel );
/* 15 coefficients */
#define x264_coeff_level_run15_mmx2 x264_template(coeff_level_run15_mmx2)
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run15_sse2 x264_template(coeff_level_run15_sse2)
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run15_lzcnt x264_template(coeff_level_run15_lzcnt)
int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run15_ssse3 x264_template(coeff_level_run15_ssse3)
int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run15_ssse3_lzcnt x264_template(coeff_level_run15_ssse3_lzcnt)
int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run15_avx2 x264_template(coeff_level_run15_avx2)
int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel );
/* 4 coefficients */
#define x264_coeff_level_run4_mmx2 x264_template(coeff_level_run4_mmx2)
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run4_lzcnt x264_template(coeff_level_run4_lzcnt)
int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run4_ssse3 x264_template(coeff_level_run4_ssse3)
int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run4_ssse3_lzcnt x264_template(coeff_level_run4_ssse3_lzcnt)
int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
/* 8 coefficients */
#define x264_coeff_level_run8_mmx2 x264_template(coeff_level_run8_mmx2)
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run8_lzcnt x264_template(coeff_level_run8_lzcnt)
int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run8_sse2 x264_template(coeff_level_run8_sse2)
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run8_ssse3 x264_template(coeff_level_run8_ssse3)
int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
#define x264_coeff_level_run8_ssse3_lzcnt x264_template(coeff_level_run8_ssse3_lzcnt)
int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
/* trellis_cabac entry points (sse2 and ssse3), all returning int.
 * TRELLIS_PARAMS is a macro defined outside this header that supplies the
 * leading parameters common to every variant; only the trailing,
 * variant-specific parameters are spelled out here:
 *   4x4 / 8x8         : + int b_ac / int b_interlaced
 *   4x4_psy / 8x8_psy : the same, + dctcoef *fenc_dct, int i_psy_trellis
 *   dc                : + int i_coefs
 *   chroma_422_dc     : no extra parameters */
#define x264_trellis_cabac_4x4_sse2 x264_template(trellis_cabac_4x4_sse2)
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );
#define x264_trellis_cabac_4x4_ssse3 x264_template(trellis_cabac_4x4_ssse3)
int x264_trellis_cabac_4x4_ssse3( TRELLIS_PARAMS, int b_ac );
#define x264_trellis_cabac_8x8_sse2 x264_template(trellis_cabac_8x8_sse2)
int x264_trellis_cabac_8x8_sse2 ( TRELLIS_PARAMS, int b_interlaced );
#define x264_trellis_cabac_8x8_ssse3 x264_template(trellis_cabac_8x8_ssse3)
int x264_trellis_cabac_8x8_ssse3( TRELLIS_PARAMS, int b_interlaced );
/* psy variants */
#define x264_trellis_cabac_4x4_psy_sse2 x264_template(trellis_cabac_4x4_psy_sse2)
int x264_trellis_cabac_4x4_psy_sse2 ( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
#define x264_trellis_cabac_4x4_psy_ssse3 x264_template(trellis_cabac_4x4_psy_ssse3)
int x264_trellis_cabac_4x4_psy_ssse3( TRELLIS_PARAMS, int b_ac, dctcoef *fenc_dct, int i_psy_trellis );
#define x264_trellis_cabac_8x8_psy_sse2 x264_template(trellis_cabac_8x8_psy_sse2)
int x264_trellis_cabac_8x8_psy_sse2 ( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
#define x264_trellis_cabac_8x8_psy_ssse3 x264_template(trellis_cabac_8x8_psy_ssse3)
int x264_trellis_cabac_8x8_psy_ssse3( TRELLIS_PARAMS, int b_interlaced, dctcoef *fenc_dct, int i_psy_trellis );
/* DC variants */
#define x264_trellis_cabac_dc_sse2 x264_template(trellis_cabac_dc_sse2)
int x264_trellis_cabac_dc_sse2 ( TRELLIS_PARAMS, int i_coefs );
#define x264_trellis_cabac_dc_ssse3 x264_template(trellis_cabac_dc_ssse3)
int x264_trellis_cabac_dc_ssse3( TRELLIS_PARAMS, int i_coefs );
#define x264_trellis_cabac_chroma_422_dc_sse2 x264_template(trellis_cabac_chroma_422_dc_sse2)
int x264_trellis_cabac_chroma_422_dc_sse2 ( TRELLIS_PARAMS );
#define x264_trellis_cabac_chroma_422_dc_ssse3 x264_template(trellis_cabac_chroma_422_dc_ssse3)
int x264_trellis_cabac_chroma_422_dc_ssse3( TRELLIS_PARAMS );
#endif

2215
common/x86/sad-a.asm Normal file

File diff suppressed because it is too large Load Diff

727
common/x86/sad16-a.asm Normal file
View File

@@ -0,0 +1,727 @@
;*****************************************************************************
;* sad16-a.asm: x86 high depth sad functions
;*****************************************************************************
;* Copyright (C) 2010-2025 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;* Henrik Gramner <henrik@gramner.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION .text
cextern pw_1
cextern pw_4
cextern pw_8
;=============================================================================
; SAD MMX
;=============================================================================
; Accumulate the absolute differences of ONE 16-pixel row (16-bit pixels,
; loaded as four 8-byte chunks) into the word lanes of m0.
; In:  r0 = block A row pointer, r1 = block A stride,
;      r2 = block B row pointer, r3 = block B stride
; Out: m0 += |A - B| (per word lane); r0 += 2*r1, r2 += 2*r3 (next row)
; Uses m1-m4 for data and passes m5-m7 to ABSW2 as scratch.
%macro SAD_INC_1x16P_MMX 0
movu m1, [r0+ 0]
movu m2, [r0+ 8]
movu m3, [r0+16]
movu m4, [r0+24]
psubw m1, [r2+ 0]
psubw m2, [r2+ 8]
psubw m3, [r2+16]
psubw m4, [r2+24]
ABSW2 m1, m2, m1, m2, m5, m6
ABSW2 m3, m4, m3, m4, m7, m5
; advance both pointers by one row (stride is scaled by 2 for 16-bit pixels)
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
; reduce the four chunks pairwise before adding to the running sum
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
%endmacro
; Accumulate the absolute differences of TWO 8-pixel rows (16-bit pixels,
; two 8-byte chunks per row) into the word lanes of m0.
; In:  r0/r1 = block A pointer/stride, r2/r3 = block B pointer/stride
; Out: m0 += |A - B|; r0 += 4*r1, r2 += 4*r3 (two rows further down)
; Uses m1-m4 for data and passes m5-m7 to ABSW2 as scratch.
%macro SAD_INC_2x8P_MMX 0
; row 0
movu m1, [r0+0]
movu m2, [r0+8]
; row 1
movu m3, [r0+2*r1+0]
movu m4, [r0+2*r1+8]
psubw m1, [r2+0]
psubw m2, [r2+8]
psubw m3, [r2+2*r3+0]
psubw m4, [r2+2*r3+8]
ABSW2 m1, m2, m1, m2, m5, m6
ABSW2 m3, m4, m3, m4, m7, m5
; step both pointers down by two rows
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
%endmacro
; Add the absolute differences of two 4-pixel rows (one 8-byte chunk of
; 16-bit pixels per row) to the word lanes of m0.
; In:  r0/r1 = block A pointer/stride, r2/r3 = block B pointer/stride
; Out: m0 += |A - B|; both pointers moved two rows down
; Uses m1-m2 for data; m3-m4 are handed to ABSW2 as scratch.
%macro SAD_INC_2x4P_MMX 0
    ; first row: difference A - B
    movu    m1, [r0]
    psubw   m1, [r2]
    ; second row: difference A - B
    movu    m2, [r0+2*r1]
    psubw   m2, [r2+2*r3]
    ABSW2   m1, m2, m1, m2, m3, m4
    ; next row pair
    lea     r2, [r2+4*r3]
    lea     r0, [r0+4*r1]
    ; fold both rows into the accumulator
    paddw   m0, m2
    paddw   m0, m1
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
; SAD_MMX width, height, rows_per_step
; Emits pixel_sad_<width>x<height> using the SAD_INC_<rows>x<width>P_MMX
; row helpers.
; In:  r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride
; Out: eax = sum of absolute differences
;
; GPR count: the loop counter r4 is only needed when height != 4, so the
; height-4 variants request one register fewer.  NASM gives '/' a higher
; precedence than '&', so the bit test has to be parenthesized explicitly:
; "(%2&4)/4" is 1 for height 4 and 0 for heights 8 and 16, whereas the
; unparenthesized "%2&4/4" is "%2&1", i.e. always 0 for these heights.
%macro SAD_MMX 3
cglobal pixel_sad_%1x%2, 4,5-((%2&4)/4)
    pxor    m0, m0                  ; m0 = running per-lane sum
%if %2 == 4
    ; height 4: two 2-row steps, fully unrolled, no counter register
    SAD_INC_%3x%1P_MMX
    SAD_INC_%3x%1P_MMX
%else
    mov     r4d, %2/%3              ; number of row steps
.loop:
    SAD_INC_%3x%1P_MMX
    dec     r4d
    jg .loop
%endif
%if %1*%2 == 256
    ; 16x16 is the only size reduced with the unsigned horizontal add
    HADDUW  m0, m1
%else
    HADDW   m0, m1
%endif
    movd    eax, m0
    RET
%endmacro
; pixel_sad_WxH built on 8-byte MMX registers.
; Arguments: width, height, rows handled per SAD_INC_* step.
INIT_MMX mmx2
SAD_MMX 16, 16, 1
SAD_MMX 16, 8, 1
SAD_MMX 8, 16, 2
SAD_MMX 8, 8, 2
SAD_MMX 8, 4, 2
SAD_MMX 4, 8, 2
SAD_MMX 4, 4, 2
; Only the 4-pixel-wide sizes are rebuilt with the ssse3 cpuflag set.
INIT_MMX ssse3
SAD_MMX 4, 8, 2
SAD_MMX 4, 4, 2
;=============================================================================
; SAD XMM
;=============================================================================
; SAD_INC_2ROW width
; Accumulate the absolute differences of two rows into the word lanes of m0.
; In:  r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride
; Out: m0 += |diff|; r0 += 4*r1, r2 += 4*r3 (two rows of 16-bit pixels)
; The possibly-unaligned movu loads come from r2; r0 is only used as a memory
; operand of psubw.
%macro SAD_INC_2ROW 1
; a row (2*width bytes) does not fit one register: two registers per row
%if 2*%1 > mmsize
movu m1, [r2+ 0]
movu m2, [r2+16]
movu m3, [r2+2*r3+ 0]
movu m4, [r2+2*r3+16]
psubw m1, [r0+ 0]
psubw m2, [r0+16]
psubw m3, [r0+2*r1+ 0]
psubw m4, [r0+2*r1+16]
ABSW2 m1, m2, m1, m2, m5, m6
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
ABSW2 m3, m4, m3, m4, m7, m5
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
%else
; one register holds a whole row
movu m1, [r2]
movu m2, [r2+2*r3]
psubw m1, [r0]
psubw m2, [r0+2*r1]
ABSW2 m1, m2, m1, m2, m3, m4
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m0, m1
paddw m0, m2
%endif
%endmacro
;-----------------------------------------------------------------------------
; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
;-----------------------------------------------------------------------------
; SAD width, height
; Emits pixel_sad_<width>x<height> on top of SAD_INC_2ROW (two rows per step).
; In:  r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride
; Out: eax = sum of absolute differences
;
; GPR count: the loop counter r4 is only needed when height != 4.  NASM gives
; '/' a higher precedence than '&', so the bit test must be parenthesized:
; "(%2&4)/4" is 1 for height 4 and 0 for heights 8 and 16, whereas the
; unparenthesized "%2&4/4" is "%2&1", i.e. always 0 for these heights.
; Vector register count: 8 when a row spans at least one full register
; (width >= mmsize), otherwise 0 is requested.
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-((%2&4)/4),8*(%1/mmsize)
    pxor    m0, m0                  ; m0 = running per-lane sum
%if %2 == 4
    ; height 4: two 2-row steps, fully unrolled, no counter register
    SAD_INC_2ROW %1
    SAD_INC_2ROW %1
%else
    mov     r4d, %2/2               ; number of 2-row steps
.loop:
    SAD_INC_2ROW %1
    dec     r4d
    jg .loop
%endif
    HADDW   m0, m1
    movd    eax, xm0
    RET
%endmacro
; pixel_sad_WxH instantiations (width, height) per instruction set.
; The "aligned" variants omit 8x4; the avx2 build only provides the
; 16-pixel-wide sizes (one 32-byte register per row).
INIT_XMM sse2
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
SAD 8, 4
INIT_XMM sse2, aligned
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
INIT_XMM ssse3
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
SAD 8, 4
INIT_XMM ssse3, aligned
SAD 16, 16
SAD 16, 8
SAD 8, 16
SAD 8, 8
INIT_YMM avx2
SAD 16, 16
SAD 16, 8
;=============================================================================
; SAD x3/x4
;=============================================================================
; Step every pointer of the 3-candidate SAD two rows down.
; r0 = fenc (fixed FENC_STRIDE), r1-r3 = candidates, r4 = their shared stride.
%macro SAD_X3_INC_P 0
    lea     r3, [r3+4*r4]
    lea     r2, [r2+4*r4]
    lea     r1, [r1+4*r4]
    add     r0, 4*FENC_STRIDE
%endmacro
; First chunk of the 3-candidate SAD: initializes the accumulators instead of
; adding to them.
; In:  r0 = fenc (aligned load), r1-r3 = the three candidate blocks
; Out: m0, m1, m2 = |candidate - fenc| for r1, r2, r3 respectively
; m3 holds the fenc row; m4-m6 are handed to ABSW2/ABSW as scratch.
%macro SAD_X3_ONE_START 0
    mova    m3, [r0]
    ; candidate 0
    movu    m0, [r1]
    psubw   m0, m3
    ; candidate 1
    movu    m1, [r2]
    psubw   m1, m3
    ; candidate 2
    movu    m2, [r3]
    psubw   m2, m3
    ABSW    m2, m2, m6
    ABSW2   m0, m1, m0, m1, m4, m5
%endmacro
; SAD_X3_ONE fenc_offset, candidate_offset
; Add one register-wide chunk of absolute differences to the three
; accumulators m0-m2 (candidates at r1, r2, r3 against fenc at r0).
; m6 holds the fenc chunk until all three subtractions are done and is then
; reused as scratch for the absolute-value helpers.
%macro SAD_X3_ONE 2
mova m6, [r0+%1]
movu m3, [r1+%2]
movu m4, [r2+%2]
movu m5, [r3+%2]
psubw m3, m6
psubw m4, m6
psubw m5, m6
; fenc (m6) is dead from here on, so it may serve as a temporary
ABSW2 m3, m4, m3, m4, m7, m6
ABSW m5, m5, m6
paddw m0, m3
paddw m1, m4
paddw m2, m5
%endmacro
; SAD_X3_END width, height
; Horizontally reduce the three accumulators m0-m2 and store them as three
; consecutive dwords in the scores array (6th argument), then return.
%macro SAD_X3_END 2
; MMX 16x16 is the only case reduced with the unsigned horizontal add
%if mmsize == 8 && %1*%2 == 256
HADDUW m0, m3
HADDUW m1, m4
HADDUW m2, m5
%else
HADDW m0, m3
HADDW m1, m4
HADDW m2, m5
%endif
%if UNIX64
; scores pointer is still live in r5
movd [r5+0], xm0
movd [r5+4], xm1
movd [r5+8], xm2
%else
; otherwise fetch the scores pointer through r5mp into r0
mov r0, r5mp
movd [r0+0], xm0
movd [r0+4], xm1
movd [r0+8], xm2
%endif
RET
%endmacro
; Step every pointer of the 4-candidate SAD two rows down.
; r0 = fenc (fixed FENC_STRIDE), r1-r4 = candidates, r5 = their shared stride.
%macro SAD_X4_INC_P 0
    lea     r4, [r4+4*r5]
    lea     r3, [r3+4*r5]
    lea     r2, [r2+4*r5]
    lea     r1, [r1+4*r5]
    add     r0, 4*FENC_STRIDE
%endmacro
; First chunk of the 4-candidate SAD: initializes the accumulators instead of
; adding to them.
; In:  r0 = fenc (aligned load), r1-r4 = the four candidate blocks
; Out: m0-m3 = |candidate - fenc| for r1-r4 respectively
%macro SAD_X4_ONE_START 0
mova m4, [r0]
movu m0, [r1]
movu m1, [r2]
movu m2, [r3]
movu m3, [r4]
psubw m0, m4
psubw m1, m4
psubw m2, m4
psubw m3, m4
ABSW2 m0, m1, m0, m1, m5, m6
; fenc (m4) is no longer needed, so it doubles as a temporary here
ABSW2 m2, m3, m2, m3, m4, m7
%endmacro
; SAD_X4_ONE fenc_offset, candidate_offset
; Add one register-wide chunk of absolute differences to the four
; accumulators m0-m3 (candidates at r1-r4 against fenc at r0).
; Three code paths, chosen by how many vector registers are available and
; whether pabsw exists; all three add the same values to m0-m3.
%macro SAD_X4_ONE 2
mova m4, [r0+%1]
movu m5, [r1+%2]
movu m6, [r2+%2]
%if num_mmregs > 8
; enough registers to keep all four differences live at once (uses m5-m10)
movu m7, [r3+%2]
movu m8, [r4+%2]
psubw m5, m4
psubw m6, m4
psubw m7, m4
psubw m8, m4
ABSW2 m5, m6, m5, m6, m9, m10
ABSW2 m7, m8, m7, m8, m9, m10
paddw m0, m5
paddw m1, m6
paddw m2, m7
paddw m3, m8
%elif cpuflag(ssse3)
; 8 registers, pabsw needs no scratch: m4 is overwritten by the 4th candidate
; and fenc is re-read from memory for its subtraction
movu m7, [r3+%2]
psubw m5, m4
psubw m6, m4
psubw m7, m4
movu m4, [r4+%2]
pabsw m5, m5
psubw m4, [r0+%1]
pabsw m6, m6
pabsw m7, m7
pabsw m4, m4
paddw m0, m5
paddw m1, m6
paddw m2, m7
paddw m3, m4
%else ; num_mmregs == 8 && !ssse3
; process candidates in two pairs, reusing m5/m6; m7 is the only free scratch
psubw m5, m4
psubw m6, m4
ABSW m5, m5, m7
ABSW m6, m6, m7
paddw m0, m5
paddw m1, m6
movu m5, [r3+%2]
movu m6, [r4+%2]
psubw m5, m4
psubw m6, m4
; fenc (m4) is dead after the subtractions and becomes the second scratch
ABSW2 m5, m6, m5, m6, m7, m4
paddw m2, m5
paddw m3, m6
%endif
%endmacro
; SAD_X4_END width, height
; Horizontally reduce the four accumulators m0-m3 and store them as four
; consecutive dwords through the pointer read from r6mp (the argument that
; follows the stride), then return.
%macro SAD_X4_END 2
; MMX 16x16 is the only case reduced with the unsigned horizontal add
%if mmsize == 8 && %1*%2 == 256
HADDUW m0, m4
HADDUW m1, m5
HADDUW m2, m6
HADDUW m3, m7
%else
HADDW m0, m4
HADDW m1, m5
HADDW m2, m6
HADDW m3, m7
%endif
; r0 (fenc) is no longer needed; reuse it for the scores pointer
mov r0, r6mp
movd [r0+ 0], xm0
movd [r0+ 4], xm1
movd [r0+ 8], xm2
movd [r0+12], xm3
RET
%endmacro
; SAD_X_2xNP num_candidates, stride_reg, first_chunk, chunk_count
; Process two rows of the current position: for each register-wide chunk
; x in [first_chunk, first_chunk+chunk_count) invoke SAD_X<n>_ONE once for the
; current row and once for the row below (fenc uses the fixed FENC_STRIDE,
; the candidates use stride_reg).
%macro SAD_X_2xNP 4
%assign x %3
%rep %4
SAD_X%1_ONE x*mmsize, x*mmsize
SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
%assign x x+1
%endrep
%endmacro
; pixel_vsad: sum of absolute differences between vertically adjacent rows of
; a 16-pixel-wide (two 16-byte registers) column of 16-bit pixels.
; In:  r0 = source (aligned loads), r1 = stride (scaled by 2 for addressing),
;      r2d = number of rows, consumed two at a time
; Out: eax = sum over all adjacent row pairs
; m2/m3 always carry the last row read so that it can be diffed against the
; first row of the next iteration.
%macro PIXEL_VSAD 0
cglobal pixel_vsad, 3,3,8
; prologue: rows 0 and 1
mova m0, [r0]
mova m1, [r0+16]
mova m2, [r0+2*r1]
mova m3, [r0+2*r1+16]
lea r0, [r0+4*r1]
psubw m0, m2
psubw m1, m3
ABSW2 m0, m1, m0, m1, m4, m5
paddw m0, m1
sub r2d, 2
je .end
.loop:
; load the next two rows (m4/m5 = row n, m6/m7 = row n+1)
mova m4, [r0]
mova m5, [r0+16]
mova m6, [r0+2*r1]
mova m7, [r0+2*r1+16]
lea r0, [r0+4*r1]
; previous row - row n, and row n - row n+1
psubw m2, m4
psubw m3, m5
psubw m4, m6
psubw m5, m7
ABSW m2, m2, m1
ABSW m3, m3, m1
ABSW m4, m4, m1
ABSW m5, m5, m1
paddw m0, m2
paddw m0, m3
paddw m0, m4
paddw m0, m5
; row n+1 becomes the "previous row" of the next iteration
mova m2, m6
mova m3, m7
sub r2d, 2
jg .loop
.end:
; the per-lane sum only fits a signed word at 9-bit depth; otherwise the
; unsigned horizontal add is required
%if BIT_DEPTH == 9
HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
%else
HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
%endif
movd eax, m0
RET
%endmacro
; 16-byte-register builds of pixel_vsad for each supported cpuflag set.
INIT_XMM sse2
PIXEL_VSAD
INIT_XMM ssse3
PIXEL_VSAD
INIT_XMM xop
PIXEL_VSAD
; AVX2 pixel_vsad: same computation as the PIXEL_VSAD macro, but one 32-byte
; register holds a whole 16-pixel row, so a single register per row suffices.
; In:  r0 = source (aligned loads), r1 = stride, r2d = rows (two per iteration)
; Out: eax = sum of absolute vertical differences
INIT_YMM avx2
cglobal pixel_vsad, 3,3
; prologue: |row0 - row1|; m1 keeps the last row read
mova m0, [r0]
mova m1, [r0+2*r1]
lea r0, [r0+4*r1]
psubw m0, m1
pabsw m0, m0
sub r2d, 2
je .end
.loop:
mova m2, [r0]
mova m3, [r0+2*r1]
lea r0, [r0+4*r1]
; previous row - row n, and row n - row n+1
psubw m1, m2
psubw m2, m3
pabsw m1, m1
pabsw m2, m2
paddw m0, m1
paddw m0, m2
; row n+1 becomes the "previous row" of the next iteration
mova m1, m3
sub r2d, 2
jg .loop
.end:
%if BIT_DEPTH == 9
HADDW m0, m1
%else
HADDUW m0, m1
%endif
movd eax, xm0
RET
;-----------------------------------------------------------------------------
; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
; uint16_t *pix2, intptr_t i_stride, int scores[3] )
;-----------------------------------------------------------------------------
; SAD_X num_candidates, width, height
; Emits pixel_sad_x<n>_<width>x<height>: the SAD of one fenc block against
; 3 or 4 candidate blocks that share a stride.
; Register use: r0 = fenc, r1..r<n> = candidates, r<n+1> = stride (STRIDE),
; r6 = loop counter.  XMM_REGS must be %define'd by the instantiation site and
; is passed to cglobal as the vector register count.
%macro SAD_X 3
cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
%assign regnum %1+1
%xdefine STRIDE r %+ regnum
; the first row pair is handled before the loop, hence height/2-1 iterations
mov r6, %3/2-1
; first row pair: the first chunk initializes the accumulators, the
; remaining chunks (if any) are added by SAD_X_2xNP starting at chunk 1
SAD_X%1_ONE_START
SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
.loop:
SAD_X%1_INC_P
SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
dec r6
jg .loop
%if %1 == 4
; NOTE(review): SAD_X4_END fetches the scores pointer from r6mp into r0
; itself, so this load looks redundant -- confirm before removing.
mov r6, r6m
%endif
SAD_X%1_END %2, %3
%endmacro
; pixel_sad_x3/x4 instantiations (candidates, width, height).
; XMM_REGS is the vector register count that SAD_X passes to cglobal; it is
; redefined before each group because the x4 variants and the non-ssse3
; paths use more registers.
INIT_MMX mmx2
%define XMM_REGS 0
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
; MMX registers with ssse3: only the 4-pixel-wide sizes (XMM_REGS is still 0)
INIT_MMX ssse3
SAD_X 3, 4, 8
SAD_X 3, 4, 4
SAD_X 4, 4, 8
SAD_X 4, 4, 4
INIT_XMM ssse3
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
; sse2 requests more registers than ssse3 (8 for x3, 11 for x4; the x4 path
; names m9/m10 as ABSW2 temporaries)
INIT_XMM sse2
%define XMM_REGS 8
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 11
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
INIT_XMM xop
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
SAD_X 3, 8, 16
SAD_X 3, 8, 8
SAD_X 3, 8, 4
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8
SAD_X 4, 8, 16
SAD_X 4, 8, 8
SAD_X 4, 8, 4
; avx2: only the 16-pixel-wide sizes (one 32-byte register per row)
INIT_YMM avx2
%define XMM_REGS 7
SAD_X 3, 16, 16
SAD_X 3, 16, 8
%define XMM_REGS 9
SAD_X 4, 16, 16
SAD_X 4, 16, 8
;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
; intra_sad_x3_4x4: SAD of a 4x4 source block against the vertical,
; horizontal and DC intra predictions built from the neighbouring pixels.
; In:  r0 = fenc (rows FENC_STRIDEB apart), r1 = fdec (rows FDEC_STRIDEB
;      apart; the top row is read at r1-FDEC_STRIDEB, the left column just
;      before each row start), r2 = res
; Out: res[0] = V cost, res[1] = H cost, res[2] = DC cost
;      (the ssse3 path stores a full 16-byte vector, so a 4th dword is written)
%macro INTRA_SAD_X3_4x4 0
cglobal intra_sad_x3_4x4, 3,3,7
; m0 = top row duplicated into both halves (V prediction for two rows)
%if cpuflag(ssse3)
movddup m0, [r1-1*FDEC_STRIDEB]
%else
movq m0, [r1-1*FDEC_STRIDEB]
punpcklqdq m0, m0
%endif
; m1 = fenc rows 0,1 and m2 = fenc rows 2,3 (high halves filled below)
movq m1, [r0+0*FENC_STRIDEB]
movq m2, [r0+2*FENC_STRIDEB]
; horizontal sum of the four top pixels, broadcast to every lane
pshuflw m6, m0, q1032
paddw m6, m0
pshuflw m5, m6, q2301
paddw m6, m5
punpcklqdq m6, m6 ; A+B+C+D 8 times
movhps m1, [r0+1*FENC_STRIDEB]
movhps m2, [r0+3*FENC_STRIDEB]
; V cost: |fenc - top| over all four rows, accumulated in m0
psubw m3, m1, m0
psubw m0, m2
ABSW2 m3, m0, m3, m0, m4, m5
paddw m0, m3
; gather the left-neighbour pixel of each row and broadcast it across the
; row's four lanes (H prediction): m3 = rows 0,1 and m4 = rows 2,3
movd m3, [r1+0*FDEC_STRIDEB-4]
movd m4, [r1+2*FDEC_STRIDEB-4]
movhps m3, [r1+1*FDEC_STRIDEB-8]
movhps m4, [r1+3*FDEC_STRIDEB-8]
pshufhw m3, m3, q3333
pshufhw m4, m4, q3333
pshuflw m3, m3, q1111 ; FF FF EE EE
pshuflw m4, m4, q1111 ; HH HH GG GG
; DC = (sum of top + sum of left + 4) >> 3, in every lane of m5
paddw m5, m3, m4
paddw m6, [pw_4]
paddw m6, m5
pshufd m5, m5, q1032
paddw m5, m6
psrlw m5, 3
; DC differences in m5/m6, H differences in m1/m2
psubw m6, m5, m2
psubw m5, m1
psubw m1, m3
psubw m2, m4
ABSW2 m5, m6, m5, m6, m3, m4
ABSW2 m1, m2, m1, m2, m3, m4
paddw m5, m6
paddw m1, m2
%if cpuflag(ssse3)
; pack V (m0), H (m1) and DC (m5) partial sums, then widen to dwords
phaddw m0, m1
movhlps m3, m5
paddw m5, m3
phaddw m0, m5
pmaddwd m0, [pw_1]
mova [r2], m0
%else
HADDW m0, m3
HADDW m1, m3
HADDW m5, m3
movd [r2], m0 ; V prediction cost
movd [r2+4], m1 ; H prediction cost
movd [r2+8], m5 ; DC prediction cost
%endif
RET
%endmacro
; intra_sad_x3_4x4 builds; the avx build takes the ssse3 code path of the macro
; if cpuflag(ssse3) holds for avx (presumably it does -- see x86inc.asm).
INIT_XMM sse2
INTRA_SAD_X3_4x4
INIT_XMM ssse3
INTRA_SAD_X3_4x4
INIT_XMM avx
INTRA_SAD_X3_4x4
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------
;m0 = DC
;m6 = V
;m7 = H
;m1 = DC score
;m2 = V score
;m3 = H score
;m5 = temp
;m4 = pixel row
; INTRA_SAD_HVDC_ITER row, shuffle
; Score one 8-pixel source row (row index 0-7; r0 is pre-biased by 4 rows)
; against the DC (m0), V (m6) and H predictions.  The H prediction of the row
; is produced by broadcasting one dword of m7 with the given pshufd pattern.
; Results go to m1 (DC), m2 (V) and m3 (H) through ACCUM paddw, which is
; passed the row index as its last argument.
%macro INTRA_SAD_HVDC_ITER 2
; DC
mova m4, [r0+(%1-4)*FENC_STRIDEB]
psubw m4, m0
ABSW m4, m4, m5
ACCUM paddw, 1, 4, %1
; V
mova m4, [r0+(%1-4)*FENC_STRIDEB]
psubw m4, m6
ABSW m4, m4, m5
ACCUM paddw, 2, 4, %1
; H
pshufd m5, m7, %2
psubw m5, [r0+(%1-4)*FENC_STRIDEB]
ABSW m5, m5, m4
ACCUM paddw, 3, 5, %1
%endmacro
; intra_sad_x3_8x8: SAD of an 8x8 source block against the V, H and DC intra
; predictions derived from the edge array.
; In:  r0 = fenc, r1 = edge (left pixels read from index 7, top pixels from
;      index 16), r2 = res
; Out: res[0] = V cost, res[1] = H cost, res[2] = DC cost
;      (the ssse3 path stores a full 16-byte vector, so a 4th dword is written)
%macro INTRA_SAD_X3_8x8 0
cglobal intra_sad_x3_8x8, 3,3,8
; bias r0 by four rows so that row offsets are (row-4)*FENC_STRIDEB
add r0, 4*FENC_STRIDEB
movu m0, [r1+7*SIZEOF_PIXEL]
mova m6, [r1+16*SIZEOF_PIXEL] ;V prediction
mova m7, m0
; DC = (sum of 8 left + 8 top pixels + 8) >> 4, broadcast to all lanes
paddw m0, m6
; m7 = upper four left pixels, each duplicated into a dword for pshufd
punpckhwd m7, m7
HADDW m0, m4
paddw m0, [pw_8]
psrlw m0, 4
SPLATW m0, m0
INTRA_SAD_HVDC_ITER 0, q3333
INTRA_SAD_HVDC_ITER 1, q2222
INTRA_SAD_HVDC_ITER 2, q1111
INTRA_SAD_HVDC_ITER 3, q0000
; m7 = lower four left pixels, duplicated the same way, for rows 4-7
movq m7, [r1+7*SIZEOF_PIXEL]
punpcklwd m7, m7
INTRA_SAD_HVDC_ITER 4, q3333
INTRA_SAD_HVDC_ITER 5, q2222
INTRA_SAD_HVDC_ITER 6, q1111
INTRA_SAD_HVDC_ITER 7, q0000
; the digits in the comments below name the accumulator register
; (m1 = DC, m2 = V, m3 = H) each lane originates from
%if cpuflag(ssse3)
phaddw m2, m3 ; 2 2 2 2 3 3 3 3
movhlps m3, m1
paddw m1, m3 ; 1 1 1 1 _ _ _ _
phaddw m2, m1 ; 2 2 3 3 1 1 _ _
pmaddwd m2, [pw_1] ; 2 3 1 _
mova [r2], m2
%else
HADDW m2, m4
HADDW m3, m4
HADDW m1, m4
movd [r2+0], m2
movd [r2+4], m3
movd [r2+8], m1
%endif
RET
%endmacro
; 16-byte-register builds of intra_sad_x3_8x8.
INIT_XMM sse2
INTRA_SAD_X3_8x8
INIT_XMM ssse3
INTRA_SAD_X3_8x8
; INTRA_SAD_HVDC_ITER_YMM row, shuffle
; 32-byte variant that scores two source rows at once: row (low 128-bit lane)
; and row+4 (high lane); r0 is pre-biased by 4 rows.
; Note that the accumulator roles differ from the 16-byte macro:
; m1 = V score, m2 = H score, m3 = DC score.
%macro INTRA_SAD_HVDC_ITER_YMM 2
mova xm4, [r0+(%1-4)*FENC_STRIDEB]
vinserti128 m4, m4, [r0+%1*FENC_STRIDEB], 1
pshufd m5, m7, %2
psubw m5, m4
pabsw m5, m5
ACCUM paddw, 2, 5, %1 ; H
psubw m5, m4, m6
psubw m4, m0
pabsw m5, m5
pabsw m4, m4
ACCUM paddw, 1, 5, %1 ; V
ACCUM paddw, 3, 4, %1 ; DC
%endmacro
; AVX2 intra_sad_x3_8x8: same contract as the INTRA_SAD_X3_8x8 macro
; (r0 = fenc, r1 = edge, r2 = res; res = V, H, DC costs), processing rows
; n and n+4 together in the two 128-bit lanes.
INIT_YMM avx2
cglobal intra_sad_x3_8x8, 3,3,8
add r0, 4*FENC_STRIDEB
movu xm0, [r1+7*SIZEOF_PIXEL]
vbroadcasti128 m6, [r1+16*SIZEOF_PIXEL] ; V prediction
; low lane <- upper four left pixels (rows 0-3), high lane <- lower four
; left pixels (rows 4-7); punpcklwd below duplicates each into a dword
vpermq m7, m0, q0011
paddw xm0, xm6
paddw xm0, [pw_1] ; equal to +8 after HADDW
HADDW xm0, xm4
psrld xm0, 4
vpbroadcastw m0, xm0
punpcklwd m7, m7
INTRA_SAD_HVDC_ITER_YMM 0, q3333
INTRA_SAD_HVDC_ITER_YMM 1, q2222
INTRA_SAD_HVDC_ITER_YMM 2, q1111
INTRA_SAD_HVDC_ITER_YMM 3, q0000
; the digits in the comments below name the accumulator register
; (m1 = V, m2 = H, m3 = DC) each lane originates from
phaddw m1, m2 ; 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2
punpckhqdq m2, m3, m3
paddw m3, m2 ; 3 3 3 3 _ _ _ _ 3 3 3 3 _ _ _ _
phaddw m1, m3 ; 1 1 2 2 3 3 _ _ 1 1 2 2 3 3 _ _
; fold the high lane (rows 4-7) onto the low lane (rows 0-3)
vextracti128 xm2, m1, 1
paddw xm1, xm2 ; 1 1 2 2 3 3 _ _
pmaddwd xm1, [pw_1] ; 1 2 3 _
mova [r2], xm1
RET

881
common/x86/trellis-64.asm Normal file
View File

@@ -0,0 +1,881 @@
;*****************************************************************************
;* trellis-64.asm: x86_64 trellis quantization
;*****************************************************************************
;* Copyright (C) 2012-2025 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
; This is a pretty straight-forward translation of the C code, except:
; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
; 4x parallel, handling 4 node_ctxs of the same coef (even if some of those
; nodes are invalid).
; * Interprocedural register allocation. Eliminates argument-passing overhead
; to trellis_coef* subroutines. Also reduces codesize.
; Optimizations that I tried, and rejected because they were not faster:
; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
; Costs too much icache compared to the negligible speedup.
; * There are only 21 possible sets of live node_ctxs; we could keep track of
; exactly which set we're in and feed that (along with abs_level) into a jump
; table instead of the switch to select a trellis_coef subroutine. This would
; eliminate all branches about which node_ctxs are live, but costs either a
; bunch of icache or a bunch of call/ret, and the jump table itself is
; unpredictable.
; * Separate versions of trellis_coef* depending on whether we're doing the 1st
; or the 2nd of the two abs_level candidates. This would eliminate some
; branches about if(score is better).
; * Special case more values of coef. I had a coef2 at some intermediate point
; in the optimization process, but it didn't end up worthwhile in conjunction
; with all the other optimizations.
; * Unroll or simd writeback. I don't know why this didn't help.
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
; ~15 in every dword: mask for the "&~15" step of the DC rounding path
pd_m16: times 4 dd -16
; 1 in the low qword only: turns a duplicated value into the pair (value-1, value)
sq_1: dq 1, 0
; rounding term of "(unquant_mf * abs_level + 128) >> 8", one per qword
pq_128: times 2 dq 128
; keeps the low 32 bits of each qword (used by SQUARE)
pq_ffffffff: times 2 dq 0xffffffff
cextern pd_8
cextern pd_0123
cextern pd_4567
cextern_common cabac_entropy
cextern_common cabac_transition
cextern cabac_size_unary
cextern cabac_transition_unary
cextern_common dct4_weight_tab
cextern_common dct8_weight_tab
cextern_common dct4_weight2_tab
cextern_common dct8_weight2_tab
cextern_common last_coeff_flag_offset_8x8
cextern_common significant_coeff_flag_offset_8x8
cextern_common coeff_flag_offset_chroma_422_dc
SECTION .text
; initial score stored into node 0 before the first coefficient is processed
%define TRELLIS_SCORE_BIAS 1<<60
; per-node storage unit used when sizing the node arrays on the stack
%define SIZEOF_NODE 16
; bit costs are scaled to lambda units with ">> (CABAC_SIZE_BITS - LAMBDA_BITS)"
%define CABAC_SIZE_BITS 8
%define LAMBDA_BITS 4
; SQUARE dst, tmp (register numbers, not names)
; Replace the low dword of each qword of m<dst> by its square, as a 64-bit
; result (pmuludq is an unsigned 32x32->64 multiply, hence the abs).
; tmp is only used on the sse2 HIGH_BIT_DEPTH path, where dst and tmp are
; SWAPped, i.e. the register names are exchanged afterwards.
%macro SQUARE 2 ; dst, tmp
; could use pmuldq here, to eliminate the abs. but that would involve
; templating a sse4 version of all of trellis, for negligible speedup.
%if cpuflag(ssse3)
pabsd m%1, m%1
pmuludq m%1, m%1
%elif HIGH_BIT_DEPTH
ABSD m%2, m%1
SWAP %1, %2
pmuludq m%1, m%1
%else
; no abs here: only the low 32 bits of the product are kept, and those are
; identical for x and -x.  Presumably valid because the 8-bit-depth squares
; fit in 32 bits -- TODO confirm the value range.
pmuludq m%1, m%1
pand m%1, [pq_ffffffff]
%endif
%endmacro
; LOAD_DUP dst, src
; Load from src and duplicate the low qword of dst into its high qword.
; The two paths read different amounts of memory: movddup loads 8 bytes,
; movd loads 4 bytes (zero-extended).  Callers only rely on the low dword of
; each qword (they feed pmuludq/psrad/dword arithmetic).
%macro LOAD_DUP 2 ; dst, src
%if cpuflag(ssse3)
movddup %1, %2
%else
movd %1, %2
punpcklqdq %1, %1
%endif
%endmacro
;-----------------------------------------------------------------------------
; int trellis_cabac_4x4_psy(
; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
; int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
; uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
; uint64_t level_state0, uint16_t level_state1,
; int b_ac, dctcoef *fenc_dct, int psy_trellis )
;-----------------------------------------------------------------------------
; TRELLIS name, num_coefs, dc, psy
; Emits one trellis quantization entry point.  num_coefs (64, 16 or 8), dc and
; psy are compile-time switches that select code inside TRELLIS_LOOP and here.
; Returns (eax): 1 after levels were written back to dct, 0 when the best path
; is the all-zero one (for non-DC 4x4 the first 16 dct coefficients are then
; cleared).
; Stack scratch layout relative to "stack" (= rsp inside this function):
;   +0 last_nnz, +8 zigzag, +16 orig_coefs, +24 quant_coefs (both non-WIN64
;   only), +32 unquant_mf, +40 levelgt1_ctx, +48 ssd (two 16-byte slots),
;   +80 cost_siglast (three dwords), +96 level_tree, followed by two
;   8*SIZEOF_NODE node arrays (cur/prev).
%macro TRELLIS 4
%define num_coefs %2
%define dc %3
%define psy %4
; 4 arguments loaded into registers, 15 GPRs and 9 vector registers used
cglobal %1, 4,15,9
%assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
; the trailing terms keep rsp 16-byte aligned after the SUB
%assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
SUB rsp, pad
DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
%if WIN64
%define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
%else
%define level_statem rsp+stack_offset+32
%endif
; argument 11 has a different meaning per variant
%define b_acm r11m ; 4x4 only
%define b_interlacedm r11m ; 8x8 only
%define i_coefsm1 r11m ; dc only
%define fenc_dctm r12m
%define psy_trellism r13m
%if num_coefs == 64
; pre-scale b_interlaced by 64 (in place, in its argument slot) so that
; "b_interlaced + i" indexes significant_coeff_flag_offset_8x8[b_interlaced][i]
shl dword b_interlacedm, 6
%define dct_weight1_tab dct8_weight_tab
%define dct_weight2_tab dct8_weight2_tab
%else
%define dct_weight1_tab dct4_weight_tab
%define dct_weight2_tab dct4_weight2_tab
%endif
%define stack rsp
%define last_nnzm [stack+0]
%define zigzagm [stack+8]
; spill arguments whose registers get reused
mov last_nnzm, iid
mov zigzagm, zigzagq
%if WIN64 == 0
%define orig_coefsm [stack+16]
%define quant_coefsm [stack+24]
mov orig_coefsm, orig_coefsq
mov quant_coefsm, quant_coefsq
%endif
%define unquant_mfm [stack+32]
%define levelgt1_ctxm [stack+40]
%define ssd stack+48
%define cost_siglast stack+80
%define level_tree stack+96
; trellis_node_t is laid out differently than C.
; struct-of-arrays rather than array-of-structs, for simd.
%define nodes_curq r7
%define nodes_prevq r8
; byte offsets inside a node array: 8 qword scores, 8 dword level indices,
; then 4 bytes of cabac state per node
%define node_score(x) x*8
%define node_level_idx(x) 64+x*4
%define node_cabac_state(x) 96+x*4
lea nodes_curq, [level_tree + level_tree_size]
lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
; node 0 starts at the bias with level_idx 0; nodes 1-3 start at SCORE_MAX
mov r6, TRELLIS_SCORE_BIAS
mov [nodes_curq + node_score(0)], r6
mov dword [nodes_curq + node_level_idx(0)], 0
; interleave bytes from level_state[0..], [4..] and [8..] into one packed value
movd mm0, [level_statem + 0]
punpcklbw mm0, [level_statem + 4]
punpcklwd mm0, [level_statem + 8]
%define level_state_packed mm0 ; version for copying into node.cabac_state
pcmpeqb m7, m7 ; TRELLIS_SCORE_MAX
movq [nodes_curq + node_score(1)], m7
mova [nodes_curq + node_score(2)], m7
%define levels_usedq r4
%define levels_usedd r4d
; level_tree entry 0 is zeroed; allocation starts at index 1
mov dword [level_tree], 0
mov levels_usedd, 1
%define abs_levelq r9
%define abs_leveld r9d
%define abs_coefq r14
%define zigzagiq r5
%define zigzagid r5d
%if num_coefs == 8
mov dword levelgt1_ctxm, 8
%else
mov dword levelgt1_ctxm, 9
%endif
; m6 is a loop-invariant vector: the psy strength for psy builds, or
; 2*unquant_mf[0] for dc builds
%if psy
LOAD_DUP m6, psy_trellism
%define psy_trellis m6
%elif dc
LOAD_DUP m6, [unquant_mfq]
paddd m6, m6
%define unquant_mf m6
%endif
%if dc == 0
mov unquant_mfm, unquant_mfq
%endif
; Keep a single offset register to PICify all global constants.
; They're all relative to "beginning of this asm file's .text section",
; even tables that aren't in this file.
; (Any address in .text would work, this one was just convenient.)
lea r0, [$$]
%define GLOBAL +r0-$$
TRELLIS_LOOP 0 ; node_ctx 0..3
TRELLIS_LOOP 1 ; node_ctx 1..7
.writeback:
; int level = bnode->level_idx;
; for( int i = b_ac; i <= last_nnz; i++ )
; dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
; level = level_tree[level].next;
; on entry: rax = index of the best node, r10 = dct (both set by trellis_bnode)
; ii runs from (b_ac - last_nnz) up to 0 while zigzag points at zigzag[last_nnz]
mov iid, last_nnzm
add zigzagq, iiq
neg iiq
%if num_coefs == 16 && dc == 0
mov r2d, b_acm
add iiq, r2
%endif
%define dctq r10
mov r0d, [nodes_curq + node_level_idx(0) + rax*4]
.writeback_loop:
movzx r2, byte [zigzagq + iiq]
; a level_tree entry packs "next" in its low word and abs_level in its high word
%if cpuflag(ssse3)
movd m0, [level_tree + r0*4]
movzx r0, word [level_tree + r0*4]
psrld m0, 16
; the sign is taken from the value currently stored in dct
movd m1, [dctq + r2*SIZEOF_DCTCOEF]
%if HIGH_BIT_DEPTH
psignd m0, m1
movd [dctq + r2*SIZEOF_DCTCOEF], m0
%else
psignw m0, m1
movd r4d, m0
mov [dctq + r2*SIZEOF_DCTCOEF], r4w
%endif
%else
mov r5d, [level_tree + r0*4]
%if HIGH_BIT_DEPTH
mov r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
%else
movsx r4d, word [dctq + r2*SIZEOF_DCTCOEF]
%endif
movzx r0d, r5w
; r4d = sign mask (0 or -1); (x ^ mask) - mask negates x when mask is -1
sar r4d, 31
shr r5d, 16
xor r5d, r4d
sub r5d, r4d
%if HIGH_BIT_DEPTH
mov [dctq + r2*SIZEOF_DCTCOEF], r5d
%else
mov [dctq + r2*SIZEOF_DCTCOEF], r5w
%endif
%endif
inc iiq
jle .writeback_loop
mov eax, 1
.return:
ADD rsp, pad
RET
%if num_coefs == 16 && dc == 0
; reached with eax == 0: clear the first 16 coefficients at r10 (dct), then return
.return_zero:
pxor m0, m0
mova [r10+ 0], m0
mova [r10+16], m0
%if HIGH_BIT_DEPTH
mova [r10+32], m0
mova [r10+48], m0
%endif
jmp .return
%endif
%endmacro ; TRELLIS
; TRELLIS_LOOP ctx_hi
; Per-coefficient loop of the trellis search, iterating ii downwards (towards
; b_ac for non-DC 4x4, otherwise towards 0).  It is expanded twice inside
; TRELLIS: ctx_hi=0 handles node contexts 0..3 and ctx_hi=1 contexts 1..7.
; Every nonzero-level path of either expansion continues at .i_continue1,
; i.e. in the ctx_hi=1 loop.
; Registers shared with the trellis_coef* subroutines (no argument passing):
;   r10 = ssd used for node 0 (ctx_hi=0 only), r12 = ssd used for the other
;   nodes, abs_leveld = lower candidate level, lambda2q, nodes_curq/prevq,
;   levels_usedd, plus the stack slots defined in TRELLIS.
%macro TRELLIS_LOOP 1 ; ctx_hi
.i_loop%1:
; if( !quant_coefs[i] )
mov r6, quant_coefsm
%if HIGH_BIT_DEPTH
mov abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
%else
movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
%endif
; int sigindex = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
; r10 = cabac_state_sig[sigindex] (the state byte, not its address)
mov r10, cabac_state_sigm
%if num_coefs == 64
mov r6d, b_interlacedm
add r6d, iid
movzx r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
; r13 keeps the 4:2:2 DC offset; it is reused below for lastindex
movzx r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
movzx r10, byte [r10 + r13]
%else
movzx r10, byte [r10 + iiq]
%endif
test abs_leveld, abs_leveld
jnz %%.nonzero_quant_coef
%if %1 == 0
; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
; * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
; nodes_cur[0].score -= cost_sig0;
movzx r10, word [cabac_entropy + r10*2 GLOBAL]
imul r10, lambda2q
shr r10, CABAC_SIZE_BITS - LAMBDA_BITS
sub [nodes_curq + node_score(0)], r10
%endif
ZERO_LEVEL_IDX %1, cur
jmp .i_continue%1
%%.nonzero_quant_coef:
; int sign_coef = orig_coefs[zigzag[i]];
; int abs_coef = abs( sign_coef );
; int q = abs( quant_coefs[i] );
movzx zigzagid, byte [zigzagq+iiq]
movd m0, abs_leveld
mov r6, orig_coefsm
%if HIGH_BIT_DEPTH
LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
; 16-bit coefficients: load starting 2 bytes early so the coefficient lands
; in the high word of the dword, then sign-extend it with an arithmetic shift
LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
psrad m1, 16 ; sign_coef
%endif
punpcklqdq m0, m0 ; quant_coef
%if cpuflag(ssse3)
pabsd m0, m0
pabsd m2, m1 ; abs_coef
%else
; NOTE(review): both absolute values use the sign mask of sign_coef (m1),
; which presumes quant_coef and orig_coef have the same sign -- confirm.
pxor m8, m8
pcmpgtd m8, m1 ; sign_mask
pxor m0, m8
pxor m2, m1, m8
psubd m0, m8
psubd m2, m8
%endif
; the two candidates evaluated in parallel: low qword = q-1, high qword = q
psubd m0, [sq_1] ; abs_level
movd abs_leveld, m0
xchg nodes_curq, nodes_prevq
; if( i < num_coefs-1 )
; int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] :
; num_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
; cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
; cost_sig1 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
; cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
; cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
%if %1 == 0
%if dc && num_coefs != 8
cmp iid, i_coefsm1
%else
cmp iid, num_coefs-1
%endif
je %%.zero_siglast
%endif
; r11 = cost of coding sig=0; flipping the low bit of the state yields the
; entropy table entry for the opposite decision (r12 = cost_sig1)
movzx r11, word [cabac_entropy + r10*2 GLOBAL]
xor r10, 1
movzx r12, word [cabac_entropy + r10*2 GLOBAL]
mov [cost_siglast+0], r11d
mov r10, cabac_state_lastm
%if num_coefs == 64
movzx r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
movzx r10, byte [r10 + r6]
%elif num_coefs == 8
movzx r10, byte [r10 + r13]
%else
movzx r10, byte [r10 + iiq]
%endif
movzx r11, word [cabac_entropy + r10*2 GLOBAL]
add r11, r12
mov [cost_siglast+4], r11d
; cost_siglast[2] is only computed in the ctx_hi=0 loop
%if %1 == 0
xor r10, 1
movzx r10, word [cabac_entropy + r10*2 GLOBAL]
add r10, r12
mov [cost_siglast+8], r10d
%endif
%%.skip_siglast:
; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
; int d = abs_coef - unquant_abs_level;
; uint64_t ssd = (int64_t)d*d * coef_weight[i];
%if dc
pmuludq m0, unquant_mf
%else
mov r10, unquant_mfm
LOAD_DUP m3, [r10 + zigzagiq*4]
pmuludq m0, m3
%endif
paddd m0, [pq_128]
psrld m0, 8 ; unquant_abs_level
; m4 keeps a copy of unquant_abs_level for the psy / DC-rounding code below
%if psy || dc == 0
mova m4, m0
%endif
psubd m0, m2
SQUARE 0, 3
%if dc
psllq m0, 8
%else
LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
pmuludq m0, m5
%endif
%if psy
; the psy term is skipped for coefficient index 0
test iid, iid
jz %%.dc_rounding
; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
; ssd1[k] -= psy_weight * psy_value;
mov r6, fenc_dctm
%if HIGH_BIT_DEPTH
LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
psrad m3, 16 ; orig_coef
%endif
%if cpuflag(ssse3)
psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
%else
PSIGN d, m4, m8
%endif
psubd m3, m1 ; predicted_coef
paddd m4, m3
%if cpuflag(ssse3)
pabsd m4, m4
%else
ABSD m3, m4
SWAP 4, 3
%endif
LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
pmuludq m1, psy_trellis
pmuludq m4, m1
psubq m0, m4
; in the ctx_hi=1 expansion the DC-rounding block below is not assembled, so
; the jump target for "index 0" is placed right here
%if %1
%%.dc_rounding:
%endif
%endif
%if %1 == 0
mova [ssd], m0
%endif
%if dc == 0 && %1 == 0
test iid, iid
jnz %%.skip_dc_rounding
%%.dc_rounding:
; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
; uint64_t ssd = (int64_t)d*d * coef_weight[i];
psrad m1, 31 ; sign_coef>>31
paddd m4, [pd_8]
paddd m4, m1
pand m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
psubd m4, m2 ; d
SQUARE 4, 3
pmuludq m4, m5
; overrides the [ssd] slot stored above; [ssd+16] still gets the unrounded m0
mova [ssd], m4
%%.skip_dc_rounding:
%endif
mova [ssd+16], m0
; remember the stack offset so that clocal can derive the subroutines' offset
%assign stack_offset_bak stack_offset
; three-way dispatch on the lower candidate level: <1, ==1, >1.
; Only mov-type instructions follow until the "je", so the flags of this cmp
; are still valid there.
cmp abs_leveld, 1
jl %%.switch_coef0
%if %1 == 0
mov r10, [ssd] ; trellis_coef* args
%endif
movq r12, m0
; for( int j = 0; j < 8; j++ )
; nodes_cur[j].score = TRELLIS_SCORE_MAX;
%if cpuflag(ssse3)
mova [nodes_curq + node_score(0)], m7
mova [nodes_curq + node_score(2)], m7
%else ; avoid store-forwarding stalls on k8/k10
%if %1 == 0
movq [nodes_curq + node_score(0)], m7
%endif
movq [nodes_curq + node_score(1)], m7
movq [nodes_curq + node_score(2)], m7
movq [nodes_curq + node_score(3)], m7
%endif
mova [nodes_curq + node_score(4)], m7
mova [nodes_curq + node_score(6)], m7
je %%.switch_coef1
; each case makes two calls: the first entry point handles the lower
; candidate, the "b" entry point the higher one
%%.switch_coefn:
call trellis_coefn.entry%1
call trellis_coefn.entry%1b
jmp .i_continue1
%%.switch_coef1:
call trellis_coef1.entry%1
call trellis_coefn.entry%1b
jmp .i_continue1
%%.switch_coef0:
call trellis_coef0_%1
call trellis_coef1.entry%1b
.i_continue%1:
dec iid
; non-DC 4x4 stops at b_ac; the other variants reuse the flags of "dec" and
; stop once ii drops below 0
%if num_coefs == 16 && dc == 0
cmp iid, b_acm
%endif
jge .i_loop%1
; pick the best final node: eax = its index, ZF set when that index is 0
call trellis_bnode_%1
%if %1 == 0
; best node 0 in the ctx_hi=0 loop: return with eax == 0
%if num_coefs == 16 && dc == 0
jz .return_zero
%else
jz .return
%endif
jmp .writeback
; last coefficient of the block (ctx_hi=0 only): cost_siglast[0] and [2]
; are zeroed (the qword store also clears [1])
%%.zero_siglast:
xor r6d, r6d
mov [cost_siglast+0], r6
mov [cost_siglast+8], r6d
jmp %%.skip_siglast
%endif
%endmacro ; TRELLIS_LOOP
; just a synonym for %if
; "IF<n> instruction": IF0 discards the rest of the line, IF1 emits it
; unchanged.  Used as IF%1 inside macros for per-line conditional assembly.
%macro IF0 1+
%endmacro
%macro IF1 1+
%1
%endmacro
; ZERO_LEVEL_IDX ctx_hi, source ("cur" or "prev")
; Append one zero-level entry per node to level_tree: the previous level_idx
; of each node (read from nodes_<source>) becomes the new entry's content
; (abs_level field = 0), and nodes_cur's level_idx is pointed at the new entry.
; Handles nodes 0-3, plus nodes 4-7 when ctx_hi is 1.
; Clobbers m0-m3; advances levels_used.
%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
; for( int j = 0; j < 8; j++ )
; nodes_cur[j].level_idx = levels_used;
; level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
; levels_used++;
; round levels_used up to a multiple of 4 entries (16 bytes)
add levels_usedd, 3
and levels_usedd, ~3 ; allow aligned stores
movd m0, levels_usedd
pshufd m0, m0, 0
IF%1 mova m1, m0
; new indices: levels_used+0..3 (and +4..7)
paddd m0, [pd_0123]
IF%1 paddd m1, [pd_4567]
; the old indices are read before the new ones are stored, which matters
; when source is "cur"
mova m2, [nodes_%2q + node_level_idx(0)]
IF%1 mova m3, [nodes_%2q + node_level_idx(4)]
mova [nodes_curq + node_level_idx(0)], m0
IF%1 mova [nodes_curq + node_level_idx(4)], m1
mova [level_tree + (levels_usedq+0)*4], m2
IF%1 mova [level_tree + (levels_usedq+4)*4], m3
; 4 entries consumed, or 8 when ctx_hi is 1
add levels_usedd, (1+%1)*4
%endmacro
; Entry points: name, num_coefs, dc, psy -- built once per instruction set.
INIT_XMM sse2
TRELLIS trellis_cabac_4x4, 16, 0, 0
TRELLIS trellis_cabac_8x8, 64, 0, 0
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
TRELLIS trellis_cabac_dc, 16, 1, 0
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
INIT_XMM ssse3
TRELLIS trellis_cabac_4x4, 16, 0, 0
TRELLIS trellis_cabac_8x8, 64, 0, 0
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
TRELLIS trellis_cabac_dc, 16, 1, 0
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
; Everything below is a subroutine reached with "call" from TRELLIS_LOOP, so
; the stack slots sit one return address (gprsize) further away from rsp.
%define stack rsp+gprsize
; scratch registers of the trellis_coef* subroutines
%define scoreq r14
%define bitsq r13
%define bitsd r13d
; the subroutines are assembled once, with no cpuflags selected
INIT_XMM
; clocal name
; Open a 16-byte aligned subroutine: defines both the mangled, private_prefix'ed
; global symbol and the plain local name at the same address, and sets
; stack_offset to the value saved by TRELLIS_LOOP plus one return address, so
; that stack-relative argument macros resolve correctly inside the subroutine.
%macro clocal 1
ALIGN 16
global mangle(private_prefix %+ _%1)
mangle(private_prefix %+ _%1):
%1:
%assign stack_offset stack_offset_bak+gprsize
%endmacro
; TRELLIS_BNODE ctx_hi
; Subroutine trellis_bnode_<ctx_hi>: find the node of nodes_cur with the
; lowest score among nodes 0..3 (ctx_hi=0) or 1..7 (ctx_hi=1).
; Out: eax = index of the best node, ZF set when that index is 0,
;      r10 = dct pointer (loaded for the caller's writeback / zeroing code)
; Clobbers r11.
; Trick: each score is turned into the key score*8 + j, so a single unsigned
; compare orders by score first and node index second, and the index can be
; recovered from the low 3 bits.  The multiply by 8 drops the top 3 score bits.
%macro TRELLIS_BNODE 1 ; ctx_hi
clocal trellis_bnode_%1
; int j = ctx_hi?1:0;
; trellis_node_t *bnode = &nodes_cur[j];
; while( ++j < (ctx_hi?8:4) )
; if( nodes_cur[j].score < bnode->score )
; bnode = &nodes_cur[j];
%assign j %1
mov rax, [nodes_curq + node_score(j)]
lea rax, [rax*8 + j]
%rep 3+3*%1
%assign j j+1
mov r11, [nodes_curq + node_score(j)]
lea r11, [r11*8 + j]
; keep the smaller key (unsigned)
cmp rax, r11
cmova rax, r11
%endrep
mov r10, dctm
; the flags of this "and" are consumed by the caller's jz
and eax, 7
ret
%endmacro ; TRELLIS_BNODE
TRELLIS_BNODE 0
TRELLIS_BNODE 1
; TRELLIS_COEF0 ctx_hi
; Subroutine trellis_coef0_<ctx_hi>: handle the candidate level 0 for the
; current coefficient.  Node 0 gets a new score (ctx_hi=0 only); the scores
; and cabac states of the other nodes are copied unchanged from nodes_prev to
; nodes_cur, and every node receives a zero-level level_tree entry.
; Clobbers r6, r11, scoreq (r14), m0-m3 (the latter via ZERO_LEVEL_IDX).
%macro TRELLIS_COEF0 1 ; ctx_hi
clocal trellis_coef0_%1
; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
mov r11d, [cost_siglast+0]
imul r11, lambda2q
shr r11, CABAC_SIZE_BITS - LAMBDA_BITS
add r11, [ssd+16]
%if %1 == 0
; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
mov scoreq, [nodes_prevq + node_score(0)]
add scoreq, [ssd]
sub scoreq, r11
mov [nodes_curq + node_score(0)], scoreq
%endif
; memcpy
; scores of nodes 1-3 (and 4-7 when ctx_hi is 1)
mov scoreq, [nodes_prevq + node_score(1)]
mov [nodes_curq + node_score(1)], scoreq
mova m1, [nodes_prevq + node_score(2)]
mova [nodes_curq + node_score(2)], m1
%if %1
mova m1, [nodes_prevq + node_score(4)]
mova [nodes_curq + node_score(4)], m1
mova m1, [nodes_prevq + node_score(6)]
mova [nodes_curq + node_score(6)], m1
%endif
; cabac state of node 3 (and nodes 4-7 when ctx_hi is 1)
mov r6d, [nodes_prevq + node_cabac_state(3)]
mov [nodes_curq + node_cabac_state(3)], r6d
%if %1
mova m1, [nodes_prevq + node_cabac_state(4)]
mova [nodes_curq + node_cabac_state(4)], m1
%endif
ZERO_LEVEL_IDX %1, prev
ret
%endmacro ; TRELLIS_COEF0
TRELLIS_COEF0 0
TRELLIS_COEF0 1
; START_COEF gt1
; Opening half of one node transition inside the trellis_coef* subroutines.
; Relies on the assembly-time variables j, nextj_if_invalid and
; coeff_abs_level1_offs set by the enclosing macro.
; Out: scoreq = nodes_prev[j].score + ssd, bitsd = cost of the
;      "abs_level > 1" decision (gt1) + the matching cost_siglast entry;
;      for j >= 3, r11 = the cabac state after that decision (stored later by
;      END_COEF).
; Jumps to .ctx<nextj_if_invalid> when j > 0 and the previous score is negative.
%macro START_COEF 1 ; gt1
; if( (int64_t)nodes_prev[j].score < 0 ) continue;
mov scoreq, [nodes_prevq + node_score(j)]
%if j > 0
test scoreq, scoreq
js .ctx %+ nextj_if_invalid
%endif
; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
%if j >= 3
; nodes with j >= 3 carry their own cabac state
movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
%else
; lower nodes read the pristine input state
movzx r6d, byte [level_statem + coeff_abs_level1_offs]
%endif
; flipping the low state bit selects the entropy entry for decision 1
%if %1
xor r6d, 1
%endif
movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]
; n.score += ssd;
; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
%if j == 0
add scoreq, r10
add bitsd, [cost_siglast+8]
%else
add scoreq, r12
add bitsd, [cost_siglast+4]
%endif
%endmacro ; START_COEF
; END_COEF gt1
; Second half of one trellis transition: converts bitsd into a cost, adds it
; to scoreq and, if the result beats nodes_cur[node_ctx], overwrites that node
; and appends an entry to level_tree.
; Relies on the preprocessor variables j, node_ctx, nextj_if_valid,
; coeff_abs_level1_offs (and coeff_abs_levelgt1_offs when gt1) set by the caller.
; In:  scoreq, bitsq from START_COEF; r11b = new level1 state (j >= 3);
;      r10b = new levelgt1 state (gt1 && node_ctx == 7)
; Clobbers: r6, r11, bitsq, flags; increments levels_usedd when the node is stored
%macro END_COEF 1
; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
imul bitsq, lambda2q
shr bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
add scoreq, bitsq
; if( n.score < nodes_cur[node_ctx].score )
; SET_LEVEL( n, abs_level );
; nodes_cur[node_ctx] = n;
cmp scoreq, [nodes_curq + node_score(node_ctx)]
; unsigned compare: not better than the stored node -> go to the next transition
jae .ctx %+ nextj_if_valid
mov [nodes_curq + node_score(node_ctx)], scoreq
%if j == 2 || (j <= 3 && node_ctx == 4)
; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
%elif j >= 3
; if we have updated before, then copy cabac_state from the parent node
mov r6d, [nodes_prevq + node_cabac_state(j)]
mov [nodes_curq + node_cabac_state(node_ctx)], r6d
%endif
%if j >= 3 ; skip the transition if we're not going to reuse the context
mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
%endif
%if %1 && node_ctx == 7
; r6 is (re)loaded because COEFN redefines coeff_abs_levelgt1_offs as r6 for j == 7
mov r6d, levelgt1_ctxm
mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
%endif
; level_tree entry = parent's level_idx in the low 16 bits | level << 16
mov r6d, [nodes_prevq + node_level_idx(j)]
%if %1
mov r11d, abs_leveld
shl r11d, 16
or r6d, r11d
%else
; gt1 == 0: the level is the constant 1
or r6d, 1<<16
%endif
mov [level_tree + levels_usedq*4], r6d
mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
inc levels_usedd
%endmacro ; END_COEF
; COEF1 j, nextj_if_invalid
; Emits the code at label .ctx<j> for trying abs_level == 1 out of node j.
; Falls through / jumps to .ctx<j+1> once the transition was evaluated, and to
; .ctx<nextj_if_invalid> when node j is unusable.
; Target node: j+1 for j < 3, otherwise j.
%macro COEF1 2
%assign j %1
%assign nextj_if_valid %1+1
%assign nextj_if_invalid %2
%if j < 4
%assign coeff_abs_level1_offs j+1
%else
%assign coeff_abs_level1_offs 0
%endif
%if j < 3
%assign node_ctx j+1
%else
%assign node_ctx j
%endif
.ctx %+ j:
START_COEF 0
; fixed cost added in place of the gt1 unary-size lookup used by COEFN
add bitsd, 1 << CABAC_SIZE_BITS
END_COEF 0
%endmacro ; COEF1
; COEFN j, nextj
; Emits the code at label .ctx<j> for trying abs_level > 1 out of node j.
; Both the "valid" and "invalid" continuations go to .ctx<nextj>.
; Target node: 4 for j < 4, j+1 for j in 4..6, 7 for j == 7.
; In:  r1d = prefix term prepared by COEFN_PREFIX (shifted left by 7),
;      r5d = suffix cost prepared by COEFN_PREFIX / COEFN_SUFFIX
; Clobbers (in addition to START_COEF / END_COEF): r6, r10
%macro COEFN 2
%assign j %1
%assign nextj_if_valid %2
%assign nextj_if_invalid %2
%if j < 4
%assign coeff_abs_level1_offs j+1
%assign coeff_abs_levelgt1_offs 5
%else
%assign coeff_abs_level1_offs 0
%assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
%endif
%if j < 4
%assign node_ctx 4
%elif j < 7
%assign node_ctx j+1
%else
%assign node_ctx 7
%endif
.ctx %+ j:
START_COEF 1
; if( abs_level >= 15 )
; bits += bs_size_ue_big(...)
add bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
; n.cabac_state[levelgt1_ctx]
%if j == 7 ; && compiling support for 4:2:2
; for j == 7 the gt1 context offset is taken at run time from levelgt1_ctxm
mov r6d, levelgt1_ctxm
%define coeff_abs_levelgt1_offs r6
%endif
; Only j == 7 reads the gt1 state from the parent node; the node copy of that
; state is only ever written by END_COEF when node_ctx == 7.
%if j == 7
movzx r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
%else
movzx r10, byte [level_statem + coeff_abs_levelgt1_offs]
%endif
; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
; r10 = prefix term + state; the -128 in the table address compensates for
; r1d being one row (128 entries) ahead
add r10d, r1d
movzx r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
add bitsd, r6d
%if node_ctx == 7
; new gt1 state, stored by END_COEF
movzx r10, byte [cabac_transition_unary + r10-128 GLOBAL]
%endif
END_COEF 1
%endmacro ; COEFN
; trellis_coef1: local function with four entry points that tries the
; abs_level == 1 transition out of every candidate node.
;   .entry0b / .entry0 : ctx_lo chain (nodes 0..3)
;   .entry1b / .entry1 : ctx_hi chain (nodes 1..7), also reachable as trellis_coef1_hi
; The "b" entries first derive the ssd terms in r10 / r12 from [ssd+8] / [ssd+24]
; minus r11 (r11 is set up by the caller -- not visible in this chunk).
; In the ctx_lo chain the first unusable node ends the function (invalid target = .ctx4);
; in the ctx_hi chain an unusable node only skips to the next one.
clocal trellis_coef1
.entry0b: ; ctx_lo, larger of the two abs_level candidates
mov r10, [ssd+8]
sub r10, r11
mov r12, [ssd+24]
sub r12, r11
.entry0: ; ctx_lo, smaller of the two abs_level candidates
COEF1 0, 4
COEF1 1, 4
COEF1 2, 4
COEF1 3, 4
.ctx4:
rep ret
.entry1b: ; ctx_hi, larger of the two abs_level candidates
; only r12 is needed here: the ctx_hi chain starts at j = 1 and never uses r10
mov r12, [ssd+24]
sub r12, r11
.entry1: ; ctx_hi, smaller of the two abs_level candidates
trellis_coef1_hi:
COEF1 1, 2
COEF1 2, 3
COEF1 3, 4
COEF1 4, 5
COEF1 5, 6
COEF1 6, 7
COEF1 7, 8
.ctx8:
rep ret
; COEFN_PREFIX n
; Prepares the inputs shared by all COEFN transitions of one chain:
;   r1d = abs_level (capped at 15 by COEFN_SUFFIX) << 7
;   r5d = suffix cost (0 when abs_level < 15, else computed by COEFN_SUFFIX n)
; Out-of-line part is emitted by the matching COEFN_SUFFIX n, which jumps back
; to .skip_level_suffix<n>. Clobbers r1 (hence the zigzag reload in trellis_coefn).
%macro COEFN_PREFIX 1
; int prefix = X264_MIN( abs_level - 1, 14 );
mov r1d, abs_leveld
cmp abs_leveld, 15
jge .level_suffix%1
xor r5d, r5d
.skip_level_suffix%1:
; one table row per level: the row stride used by COEFN is 128 entries
shl r1d, 7
%endmacro
; COEFN_SUFFIX n
; Out-of-line slow path of COEFN_PREFIX n for abs_level >= 15.
; Out: r5d = ((bsr(abs_level-14) * 2) + 1) << CABAC_SIZE_BITS, r1d = 15,
;      then control returns to .skip_level_suffix<n>.
%macro COEFN_SUFFIX 1
.level_suffix%1:
; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
lea r5d, [abs_levelq-14]
bsr r5d, r5d
shl r5d, CABAC_SIZE_BITS+1
add r5d, 1<<CABAC_SIZE_BITS
; int prefix = X264_MIN( abs_level - 1, 14 );
; r1d holds prefix+1 (COEFN subtracts 128 after the shift by 7)
mov r1d, 15
jmp .skip_level_suffix%1
%endmacro
; trellis_coefn: local function with four entry points that tries the
; abs_level > 1 transition out of every candidate node.
;   .entry0b / .entry0 : ctx_lo chain (nodes 0..3)
;   .entry1b / .entry1 : ctx_hi chain (nodes 4..7, then continues with 1..3)
; The "b" entries load the ssd terms for the larger candidate and increment
; abs_level before joining the common path.
clocal trellis_coefn
.entry0b:
mov r10, [ssd+8]
mov r12, [ssd+24]
inc abs_leveld
.entry0:
; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
; The C version has to be fully separate since C doesn't support multiple
; entrypoints. But return-on-first-failure isn't very important here (as
; opposed to coef1), so I might as well reduce codesize.
COEFN_PREFIX 0
COEFN 0, 1
COEFN 1, 2
COEFN 2, 3
COEFN 3, 8
.ctx8:
mov zigzagq, zigzagm ; unspill since r1 was clobbered
ret
.entry1b:
mov r12, [ssd+24]
inc abs_leveld
.entry1:
COEFN_PREFIX 1
COEFN 4, 5
COEFN 5, 6
COEFN 6, 7
COEFN 7, 1
; after node 7, reuse the j = 1..3 code of the ctx_lo chain above
jmp .ctx1
; out-of-line abs_level >= 15 paths for both COEFN_PREFIX instances
COEFN_SUFFIX 0
COEFN_SUFFIX 1

259
common/x86/util.h Normal file
View File

@@ -0,0 +1,259 @@
/*****************************************************************************
* util.h: x86 inline asm
*****************************************************************************
* Copyright (C) 2008-2025 x264 project
*
* Authors: Fiona Glaser <fiona@x264.com>
* Loren Merritt <lorenm@u.washington.edu>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
*
* This program is also available under a commercial proprietary license.
* For more information, contact us at licensing@x264.com.
*****************************************************************************/
#ifndef X264_X86_UTIL_H
#define X264_X86_UTIL_H
/* SSE-backed replacements for the generic 128-bit helpers: when the compiler
 * targets SSE, M128_ZERO becomes an __m128 literal and x264_union128_t is
 * mapped to a union whose first member is an __m128. */
#ifdef __SSE__
#include <xmmintrin.h>
#undef M128_ZERO
#define M128_ZERO ((__m128){0,0,0,0})
#define x264_union128_t x264_union128_sse_t
/* 16 bytes viewed as one vector, 2 qwords, 4 dwords, 8 words or 16 bytes */
typedef union { __m128 i; uint64_t q[2]; uint32_t d[4]; uint16_t w[8]; uint8_t b[16]; } MAY_ALIAS x264_union128_sse_t;
#if HAVE_VECTOREXT
/* GCC vector-extension type: four 32-bit lanes in 16 bytes */
typedef uint32_t v4si __attribute__((vector_size (16)));
#endif
#endif // __SSE__
#if HAVE_X86_INLINE_ASM && HAVE_MMX
#define x264_median_mv x264_median_mv_mmx2
/* Writes to dst the component-wise median of the three motion vectors a, b, c.
 * Each vector is read and written as one 32-bit unit holding two int16 values.
 * Computed as max( min(a,b), min( max(a,b), c ) ) on signed words. */
static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
asm(
"movd %1, %%mm0 \n"
"movd %2, %%mm1 \n"
"movq %%mm0, %%mm3 \n"
"movd %3, %%mm2 \n"
"pmaxsw %%mm1, %%mm0 \n" /* mm0 = max(a,b) */
"pminsw %%mm3, %%mm1 \n" /* mm1 = min(a,b) */
"pminsw %%mm2, %%mm0 \n" /* mm0 = min(max(a,b), c) */
"pmaxsw %%mm1, %%mm0 \n" /* mm0 = median */
"movd %%mm0, %0 \n"
:"=m"(*(x264_union32_t*)dst)
:"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
:"mm0", "mm1", "mm2", "mm3"
);
}
#define x264_predictor_difference x264_predictor_difference_mmx2
/* Returns the sum over consecutive candidates of |mvc[k] - mvc[k+1]| for both
 * components, i.e. i_mvc-1 vector differences in total.
 * Differences are processed two at a time from the end of the array; when
 * i_mvc is even, a single difference is handled first.
 * The per-word accumulation uses a saturating add (paddusw).
 * NOTE(review): the code reads mvc[i_mvc-2], so it presumably expects
 * i_mvc >= 2 -- confirm against callers. */
static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
{
int sum;
static const uint64_t pw_1 = 0x0001000100010001ULL;
asm(
"pxor %%mm4, %%mm4 \n" /* mm4 = accumulator */
"test $1, %1 \n"
"jnz 3f \n"
/* even count: one leftover difference (last two candidates) */
"movd -8(%2,%1,4), %%mm0 \n"
"movd -4(%2,%1,4), %%mm3 \n"
"psubw %%mm3, %%mm0 \n"
"jmp 2f \n"
"3: \n"
"dec %1 \n"
"1: \n"
/* two differences at once: {mvc[i-2],mvc[i-1]} - {mvc[i-1],mvc[i]} */
"movq -8(%2,%1,4), %%mm0 \n"
"psubw -4(%2,%1,4), %%mm0 \n"
"2: \n"
"sub $2, %1 \n" /* sets the flags tested by jg below; MMX ops leave them alone */
"pxor %%mm2, %%mm2 \n"
"psubw %%mm0, %%mm2 \n"
"pmaxsw %%mm2, %%mm0 \n" /* abs = max(x, -x) */
"paddusw %%mm0, %%mm4 \n"
"jg 1b \n"
"pmaddwd %4, %%mm4 \n" /* multiply by 1 and add word pairs -> two dwords */
"pshufw $14, %%mm4, %%mm0 \n" /* bring the high dword down */
"paddd %%mm0, %%mm4 \n"
"movd %%mm4, %0 \n"
:"=r"(sum), "+r"(i_mvc)
:"r"(mvc), "m"(MEM_DYN( mvc, const int16_t )), "m"(pw_1)
:"mm0", "mm2", "mm3", "mm4", "cc"
);
return sum;
}
#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
/* For each of the two bytes of the result: adds the corresponding bytes of
 * mvdleft and mvdtop (saturating, then clamped to 33) and yields
 * (sum > 2) + (sum > 32), i.e. 0, 1 or 2.
 * The asm operates on 32-bit loads; only the low 16 bits are returned. */
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
{
static const uint64_t pb_2 = 0x0202020202020202ULL;
static const uint64_t pb_32 = 0x2020202020202020ULL;
static const uint64_t pb_33 = 0x2121212121212121ULL;
int amvd;
asm(
"movd %1, %%mm0 \n"
"movd %2, %%mm1 \n"
"paddusb %%mm1, %%mm0 \n"
"pminub %5, %%mm0 \n" /* clamp to 33 so the signed byte compares below are safe */
"pxor %%mm2, %%mm2 \n"
"movq %%mm0, %%mm1 \n"
"pcmpgtb %3, %%mm0 \n" /* -1 where sum > 2 */
"pcmpgtb %4, %%mm1 \n" /* -1 where sum > 32 */
"psubb %%mm0, %%mm2 \n"
"psubb %%mm1, %%mm2 \n" /* 0 - masks = count of true compares */
"movd %%mm2, %0 \n"
:"=r"(amvd)
:"m"(M16( mvdleft )),"m"(M16( mvdtop )),
"m"(pb_2),"m"(pb_32),"m"(pb_33)
:"mm0", "mm1", "mm2"
);
return (uint16_t)amvd;
}
#define x264_predictor_clip x264_predictor_clip_mmx2
/* Copies the i_mvc candidates in mvc[] to dst[], clamping each component to
 * the range given by mv_limit (lower bounds in mv_limit[0], upper bounds in
 * mv_limit[1], both multiplied by 4 first) and skipping candidates that equal
 * pmv or zero. Returns the number of candidates written (the final i).
 * Candidates are handled two per iteration, followed by one trailing
 * candidate when needed. The two-at-a-time store always writes 8 bytes at
 * dst[i], even when fewer than two candidates are kept.
 * asm operands: %0 = mvc cursor, %2 = mv_limit pointer then scratch,
 * %3 = count then end pointer, %4 = i, %5 = dst, %6 = pmv, %7 = pd_32. */
static ALWAYS_INLINE int x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
static const uint32_t pd_32 = 0x20;
intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
asm(
"movq (%2), %%mm5 \n"
"movd %6, %%mm3 \n"
"psllw $2, %%mm5 \n" // Convert to subpel
"pshufw $0xEE, %%mm5, %%mm6 \n"
"dec %k3 \n"
"jz 2f \n" // if( i_mvc == 1 ) {do the last iteration}
"punpckldq %%mm3, %%mm3 \n"
"punpckldq %%mm5, %%mm5 \n"
"movd %7, %%mm4 \n"
"lea (%0,%3,4), %3 \n"
"1: \n"
"movq (%0), %%mm0 \n"
"add $8, %0 \n"
"movq %%mm3, %%mm1 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm1 \n" // mv == pmv
"pcmpeqd %%mm0, %%mm2 \n" // mv == 0
"por %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
"pmovmskb %%mm2, %k2 \n" // (mv == pmv || mv == 0) * 0xf
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"pand %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
"psrlq %%mm2, %%mm0 \n" // drop mv0 if it's skipped
"movq %%mm0, (%5,%4,4) \n"
"and $24, %k2 \n"
"add $2, %4 \n"
"add $8, %k2 \n"
"shr $4, %k2 \n" // (4-val)>>1
"sub %2, %4 \n" // +1 for each valid motion vector
"cmp %3, %0 \n"
"jl 1b \n"
"jg 3f \n" // if( i == i_mvc - 1 ) {do the last iteration}
/* Do the last iteration */
"2: \n"
"movd (%0), %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm3 \n"
"pcmpeqd %%mm0, %%mm2 \n"
"por %%mm3, %%mm2 \n"
"pmovmskb %%mm2, %k2 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"movd %%mm0, (%5,%4,4) \n"
"inc %4 \n"
"and $1, %k2 \n"
"sub %2, %4 \n" // output += !(mv == pmv || mv == 0)
"3: \n"
:"+r"(mvc), "=m"(MEM_DYN( dst, int16_t )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
:"r"(dst), "g"(pmv), "m"(pd_32), "m"(MEM_DYN( mvc, const int16_t ))
:"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "cc"
);
return i;
}
/* Same as the above, except we do (mv + 2) >> 2 on the input. */
/* Differences from x264_predictor_clip_mmx2 visible in the code: every loaded
 * candidate gets pw_2 added and is arithmetically shifted right by 2 before
 * the pmv/zero comparison and the clamp, and mv_limit is used as-is (no
 * shift left by 2). Returns the number of candidates written.
 * asm operands: %0 = mvc cursor, %2 = mv_limit pointer then scratch,
 * %3 = count then end pointer, %4 = i, %5 = dst, %6 = pw_2, %7 = pmv,
 * %8 = pd_32. */
#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
static ALWAYS_INLINE int x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
{
static const uint64_t pw_2 = 0x0002000200020002ULL;
static const uint32_t pd_32 = 0x20;
intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
asm(
"movq (%2), %%mm5 \n"
"movq %6, %%mm7 \n" /* mm7 = rounding constant 2 per word */
"movd %7, %%mm3 \n"
"pshufw $0xEE, %%mm5, %%mm6 \n"
"dec %k3 \n"
"jz 2f \n"
"punpckldq %%mm3, %%mm3 \n"
"punpckldq %%mm5, %%mm5 \n"
"movd %8, %%mm4 \n"
"lea (%0,%3,4), %3 \n"
"1: \n"
"movq (%0), %%mm0 \n"
"add $8, %0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n" /* (mv + 2) >> 2 */
"movq %%mm3, %%mm1 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm1 \n"
"pcmpeqd %%mm0, %%mm2 \n"
"por %%mm1, %%mm2 \n"
"pmovmskb %%mm2, %k2 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"pand %%mm4, %%mm2 \n"
"psrlq %%mm2, %%mm0 \n"
"movq %%mm0, (%5,%4,4) \n"
"and $24, %k2 \n"
"add $2, %4 \n"
"add $8, %k2 \n"
"shr $4, %k2 \n"
"sub %2, %4 \n"
"cmp %3, %0 \n"
"jl 1b \n"
"jg 3f \n"
/* Do the last iteration */
"2: \n"
"movd (%0), %%mm0 \n"
"paddw %%mm7, %%mm0 \n"
"psraw $2, %%mm0 \n"
"pxor %%mm2, %%mm2 \n"
"pcmpeqd %%mm0, %%mm3 \n"
"pcmpeqd %%mm0, %%mm2 \n"
"por %%mm3, %%mm2 \n"
"pmovmskb %%mm2, %k2 \n"
"pmaxsw %%mm5, %%mm0 \n"
"pminsw %%mm6, %%mm0 \n"
"movd %%mm0, (%5,%4,4) \n"
"inc %4 \n"
"and $1, %k2 \n"
"sub %2, %4 \n"
"3: \n"
:"+r"(mvc), "=m"(MEM_DYN( dst, int16_t )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
:"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(MEM_DYN( mvc, const int16_t ))
:"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", "cc"
);
return i;
}
#endif
#endif

1992
common/x86/x86inc.asm Normal file

File diff suppressed because it is too large Load Diff

937
common/x86/x86util.asm Normal file
View File

@@ -0,0 +1,937 @@
;*****************************************************************************
;* x86util.asm: x86 utility macros
;*****************************************************************************
;* Copyright (C) 2008-2025 x264 project
;*
;* Authors: Holger Lubitz <holger@lubitz.org>
;* Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
; like cextern, but with a plain x264 prefix instead of a bitdepth-specific one
; %1 = symbol name. Redefines %1 as its mangled "x264_<name>" form, records
; cglobaled_<name> = 1 via CAT_XDEFINE, and declares the symbol extern.
%macro cextern_common 1
%xdefine %1 mangle(x264 %+ _ %+ %1)
CAT_XDEFINE cglobaled_, %1, 1
extern %1
%endmacro
; ---------------------------------------------------------------------------
; Build-wide constants derived from BIT_DEPTH.
; ---------------------------------------------------------------------------

; BIT_DEPTH may be supplied by the build; it falls back to 0 otherwise.
%ifndef BIT_DEPTH
    %assign BIT_DEPTH 0
%endif

; HIGH_BIT_DEPTH is 1 only for bit depths above 8.
%assign HIGH_BIT_DEPTH 0
%if BIT_DEPTH > 8
    %assign HIGH_BIT_DEPTH 1
%endif

; Row strides, counted in pixels.
%assign FDEC_STRIDE 32
%assign FENC_STRIDE 16

; Element sizes (bytes), the NASM size keyword for one pixel, and the
; broadcast instructions matching the DCT-coefficient and pixel widths.
%if HIGH_BIT_DEPTH
    %assign SIZEOF_PIXEL 2
    %assign SIZEOF_DCTCOEF 4
    %define pixel word
    %define vpbroadcastdct vpbroadcastd
    %define vpbroadcastpix vpbroadcastw
%else
    %assign SIZEOF_PIXEL 1
    %assign SIZEOF_DCTCOEF 2
    %define pixel byte
    %define vpbroadcastdct vpbroadcastw
    %define vpbroadcastpix vpbroadcastb
%endif

; Row strides, counted in bytes.
%assign FENC_STRIDEB SIZEOF_PIXEL*FENC_STRIDE
%assign FDEC_STRIDEB SIZEOF_PIXEL*FDEC_STRIDE

; Largest representable pixel value.
%assign PIXEL_MAX ((1 << BIT_DEPTH)-1)
; FIX_STRIDES reg [, reg ...]
; In HIGH_BIT_DEPTH builds, doubles every listed operand (add x, x); expands
; to nothing otherwise. Modifies flags when it emits code.
%macro FIX_STRIDES 1-*
%if HIGH_BIT_DEPTH
%rep %0
add %1, %1
%rotate 1
%endrep
%endif
%endmacro
; SBUTTERFLY size, a, b, tmp   (a, b, tmp are register numbers)
; Interleaves registers a and b at the granularity given by the punpck suffix
; `size` (wd, dq, qdq) or, for `dqqq`, at 128-bit lane granularity.
; Result: m<a> = low interleave, m<b> = high interleave (obtained by swapping
; the names of b and tmp, so tmp ends up naming a clobbered register).
%macro SBUTTERFLY 4
%ifidn %1, dqqq
vperm2i128 m%4, m%2, m%3, q0301 ; punpckh
vinserti128 m%2, m%2, xm%3, 1 ; punpckl
%elif avx_enabled && mmsize >= 16
; three-operand form saves the register copy
punpckh%1 m%4, m%2, m%3
punpckl%1 m%2, m%3
%else
mova m%4, m%2
punpckl%1 m%2, m%3
punpckh%1 m%4, m%3
%endif
SWAP %3, %4
%endmacro
; SBUTTERFLY2 size, a, b, tmp   (register numbers)
; Variant of SBUTTERFLY written with three-operand punpck forms only: the low
; interleave is produced in tmp and the high interleave in a, then the three
; register names are rotated with SWAP (final naming follows x86inc's SWAP rules).
%macro SBUTTERFLY2 4
punpckl%1 m%4, m%2, m%3
punpckh%1 m%2, m%2, m%3
SWAP %2, %4, %3
%endmacro
; TRANSPOSE4x4W r0, r1, r2, r3, tmp   (register numbers)
; Transposes a 4x4 block of words held in four registers, using two word
; interleaves followed by two dword interleaves and a final rename.
%macro TRANSPOSE4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SWAP %2, %3
%endmacro
; TRANSPOSE2x4x4W r0, r1, r2, r3, tmp   (register numbers)
; Same word and dword interleave steps as TRANSPOSE4x4W, followed by a qword
; interleave pass instead of the final rename (presumably transposing two 4x4
; word blocks held side by side -- confirm with callers).
%macro TRANSPOSE2x4x4W 5
SBUTTERFLY wd, %1, %2, %5
SBUTTERFLY wd, %3, %4, %5
SBUTTERFLY dq, %1, %3, %5
SBUTTERFLY dq, %2, %4, %5
SBUTTERFLY qdq, %1, %2, %5
SBUTTERFLY qdq, %3, %4, %5
%endmacro
; TRANSPOSE4x4D r0, r1, r2, r3, tmp   (register numbers)
; Transposes a 4x4 block of dwords: dword interleaves, then qword interleaves,
; then a final rename of the two middle rows.
%macro TRANSPOSE4x4D 5
SBUTTERFLY dq, %1, %2, %5
SBUTTERFLY dq, %3, %4, %5
SBUTTERFLY qdq, %1, %3, %5
SBUTTERFLY qdq, %2, %4, %5
SWAP %2, %3
%endmacro
; TRANSPOSE8x8W r0..r7, tmp_or_mem [, mem2 [, flag]]
; Transposes an 8x8 block of words held in eight registers.
; x86-64: %9 is a spare register number used as scratch.
; x86-32: only eight registers exist, so %9 and %10 are memory operands used
;         as spill slots; see the in/out notes below for the 11-argument form.
%macro TRANSPOSE8x8W 9-11
%if ARCH_X86_64
SBUTTERFLY wd, %1, %2, %9
SBUTTERFLY wd, %3, %4, %9
SBUTTERFLY wd, %5, %6, %9
SBUTTERFLY wd, %7, %8, %9
SBUTTERFLY dq, %1, %3, %9
SBUTTERFLY dq, %2, %4, %9
SBUTTERFLY dq, %5, %7, %9
SBUTTERFLY dq, %6, %8, %9
SBUTTERFLY qdq, %1, %5, %9
SBUTTERFLY qdq, %2, %6, %9
SBUTTERFLY qdq, %3, %7, %9
SBUTTERFLY qdq, %4, %8, %9
SWAP %2, %5
SWAP %4, %7
%else
; in: m0..m7, unless %11 in which case m6 is in %9
; out: m0..m7, unless %11 in which case m4 is in %10
; spills into %9 and %10
%if %0<11
movdqa %9, m%7
%endif
; one row is always parked in memory so that a register is free as scratch
SBUTTERFLY wd, %1, %2, %7
movdqa %10, m%2
movdqa m%7, %9
SBUTTERFLY wd, %3, %4, %2
SBUTTERFLY wd, %5, %6, %2
SBUTTERFLY wd, %7, %8, %2
SBUTTERFLY dq, %1, %3, %2
movdqa %9, m%3
movdqa m%2, %10
SBUTTERFLY dq, %2, %4, %3
SBUTTERFLY dq, %5, %7, %3
SBUTTERFLY dq, %6, %8, %3
SBUTTERFLY qdq, %1, %5, %3
SBUTTERFLY qdq, %2, %6, %3
movdqa %10, m%2
movdqa m%3, %9
SBUTTERFLY qdq, %3, %7, %2
SBUTTERFLY qdq, %4, %8, %2
SWAP %2, %5
SWAP %4, %7
%if %0<11
movdqa m%5, %10
%endif
%endif
%endmacro
; WIDEN_SXWD src, dst_hi   (register numbers)
; Sign-extends the eight words of m<src> to dwords: the low four stay in
; m<src>, the high four go to m<dst_hi>.
; The unpack places each source word in the upper half of a dword; the
; arithmetic shift right by 16 then completes the sign extension.
%macro WIDEN_SXWD 2
punpckhwd m%2, m%1
psrad m%2, 16
%if cpuflag(sse4)
pmovsxwd m%1, m%1
%else
punpcklwd m%1, m%1
psrad m%1, 16
%endif
%endmacro
; ABSW dst, src [, tmp]
; dst = per-word absolute value of src. With SSSE3 this is a single pabsw;
; otherwise it is emulated as max(x, -x), or via a sign mask when tmp is the
; literal token `sign`.
%macro ABSW 2-3 ; dst, src, tmp (tmp used only if dst==src)
%if cpuflag(ssse3)
pabsw %1, %2
%elifidn %3, sign ; version for pairing with PSIGNW: modifies src
; dst = sign mask (-1 where src < 0); src = (src ^ mask) - mask; names swapped,
; so dst ends up naming the absolute value and src the mask
pxor %1, %1
pcmpgtw %1, %2
pxor %2, %1
psubw %2, %1
SWAP %1, %2
%elifidn %1, %2
; in-place: needs tmp to hold -x
pxor %3, %3
psubw %3, %1
pmaxsw %1, %3
%elifid %2
; src is an identifier (a register) distinct from dst: build -x directly in dst
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%elif %0 == 2
; src is not an identifier and no tmp was given: use src twice as an operand
pxor %1, %1
psubw %1, %2
pmaxsw %1, %2
%else
; src is not an identifier and tmp is available: load src only once
mova %1, %2
pxor %3, %3
psubw %3, %1
pmaxsw %1, %3
%endif
%endmacro
; ABSW2: two interleaved ABSW operations (dst1 = |src1|, dst2 = |src2|).
; The tmp registers are only used on the non-SSSE3 in-place path, which is
; selected by comparing dst1 with src1 only.
%macro ABSW2 6 ; dst1, dst2, src1, src2, tmp, tmp
%if cpuflag(ssse3)
pabsw %1, %3
pabsw %2, %4
%elifidn %1, %3
pxor %5, %5
pxor %6, %6
psubw %5, %1
psubw %6, %2
pmaxsw %1, %5
pmaxsw %2, %6
%else
pxor %1, %1
pxor %2, %2
psubw %1, %3
psubw %2, %4
pmaxsw %1, %3
pmaxsw %2, %4
%endif
%endmacro
; ABSB reg, tmp
; In-place per-byte absolute value of reg. tmp is only written on the
; non-SSSE3 path, which takes the unsigned minimum of x and -x.
%macro ABSB 2
%if cpuflag(ssse3)
pabsb %1, %1
%else
pxor %2, %2
psubb %2, %1
pminub %1, %2
%endif
%endmacro
; ABSD dst, src [, tmp]
; dst = per-dword absolute value of src.
; Non-SSSE3 path: builds a sign mask in dst, computes (x ^ mask) - mask in the
; working register and swaps names. Without tmp the working register is src
; itself (src is modified); with tmp, src is first copied into tmp.
%macro ABSD 2-3
%if cpuflag(ssse3)
pabsd %1, %2
%else
%define %%s %2
%if %0 == 3
mova %3, %2
%define %%s %3
%endif
pxor %1, %1
pcmpgtd %1, %%s
pxor %%s, %1
psubd %%s, %1
SWAP %1, %%s
%endif
%endmacro
; PSIGN size, dst, src [, src2]   (normally used through PSIGNW / PSIGND)
; SSSE3: plain psign<size>, in two- or three-operand form.
; Otherwise emulated as (x ^ m) - m, where m is the last operand.
; NOTE(review): the emulation only matches psign when m is a 0 / -1 mask
; (as produced by ABSW's `sign` variant) -- confirm at call sites.
%macro PSIGN 3-4
%if cpuflag(ssse3) && %0 == 4
psign%1 %2, %3, %4
%elif cpuflag(ssse3)
psign%1 %2, %3
%elif %0 == 4
pxor %2, %3, %4
psub%1 %2, %4
%else
pxor %2, %3
psub%1 %2, %3
%endif
%endmacro
; word / dword front-ends: the trailing comma lets the caller's operands follow
%define PSIGNW PSIGN w,
%define PSIGND PSIGN d,
; SPLATB_LOAD dst, addr, shuffle_mask
; Loads the dword ending at byte [addr] (i.e. starting at addr-3) and
; broadcasts from it. SSSE3: pshufb with the caller-supplied mask.
; Otherwise: bytes are doubled into words and word 3 (the doubled byte at
; [addr]) is broadcast with SPLATW.
%macro SPLATB_LOAD 3
%if cpuflag(ssse3)
movd %1, [%2-3]
pshufb %1, %3
%else
movd %1, [%2-3] ;to avoid crossing a cacheline
punpcklbw %1, %1
SPLATW %1, %1, 3
%endif
%endmacro
; SPLATW dst, src [, index=0]
; Broadcasts word `index` of src to every word of dst.
; AVX2 with index 0 uses vpbroadcastw directly. Otherwise the word is
; replicated within the low 64 bits by PSHUFLW and then widened to the full
; register (punpcklqdq for 16-byte, vpbroadcastq for 32-byte and larger).
; When src is not an identifier and index is 0, it is first loaded with movd.
%imacro SPLATW 2-3 0
%if cpuflag(avx2) && %3 == 0
vpbroadcastw %1, %2
%else
%define %%s %2
%ifid %2
%define %%s xmm%2
%elif %3 == 0
movd xmm%1, %2
%define %%s xmm%1
%endif
PSHUFLW xmm%1, %%s, (%3)*q1111
%if mmsize >= 32
vpbroadcastq %1, xmm%1
%elif mmsize == 16
punpcklqdq %1, %1
%endif
%endif
%endmacro
; SPLATD dst, src [, index=0]
; Broadcasts dword `index` of src to every dword of dst.
; AVX2 with index 0 uses vpbroadcastd directly. For 8-byte (MMX) registers
; only index 0 and 1 have dedicated paths (punpck when dst == src, pshufw
; otherwise); everything else goes through pshufd, followed by vpbroadcastq
; for registers of 32 bytes and more.
%imacro SPLATD 2-3 0
%if cpuflag(avx2) && %3 == 0
vpbroadcastd %1, %2
%else
%define %%s %2
%ifid %2
%define %%s xmm%2
%elif %3 == 0
movd xmm%1, %2
%define %%s xmm%1
%endif
%if mmsize == 8 && %3 == 0
%ifidn %1, %%s
punpckldq %1, %1
%else
pshufw %1, %%s, q1010
%endif
%elif mmsize == 8 && %3 == 1
%ifidn %1, %%s
punpckhdq %1, %1
%else
pshufw %1, %%s, q3232
%endif
%else
pshufd xmm%1, %%s, (%3)*q1111
%endif
%if mmsize >= 32
vpbroadcastq %1, xmm%1
%endif
%endif
%endmacro
; Clamps every signed word of dst to [min, max]: lower bound first, then
; upper bound.
%macro CLIPW 3 ;(dst, min, max)
pmaxsw %1, %2
pminsw %1, %3
%endmacro
; Copies the high qword of src into the low qword of dst. The contents of
; dst's high qword depend on the path taken (src's high qword on the punpck
; and pshufd paths, dst's previous high qword on the movhlps path).
%macro MOVHL 2 ; dst, src
%ifidn %1, %2
punpckhqdq %1, %2
%elif cpuflag(avx)
punpckhqdq %1, %2, %2
%elif cpuflag(sse4)
pshufd %1, %2, q3232 ; pshufd is slow on some older CPUs, so only use it on more modern ones
%else
movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
%endif
%endmacro
; HADDD sum, junk   (register numbers)
; Horizontal add of all dwords in register `sum`; the total ends up in the
; low dword of xmm<sum>. `junk` is a scratch register.
; The register is folded in half repeatedly: 512 -> 256 -> 128 -> 64 bits,
; then the last two dwords are added.
%macro HADDD 2 ; sum junk
%if sizeof%1 >= 64
vextracti32x8 ymm%2, zmm%1, 1
paddd ymm%1, ymm%2
%endif
%if sizeof%1 >= 32
vextracti128 xmm%2, ymm%1, 1
paddd xmm%1, xmm%2
%endif
%if sizeof%1 >= 16
MOVHL xmm%2, xmm%1
paddd xmm%1, xmm%2
%endif
%if cpuflag(xop) && sizeof%1 == 16
vphadddq xmm%1, xmm%1
%else
; swap the two low dwords and add
PSHUFLW xmm%2, xmm%1, q1032
paddd xmm%1, xmm%2
%endif
%endmacro
; Horizontal add of all signed words in reg; the sum ends up in the low dword.
; Generic path: pmaddwd with the pw_1 constant (defined elsewhere) adds
; adjacent word pairs into dwords, then HADDD finishes the reduction.
%macro HADDW 2 ; reg, tmp
%if cpuflag(xop) && sizeof%1 == 16
vphaddwq %1, %1
MOVHL %2, %1
paddd %1, %2
%else
pmaddwd %1, [pw_1]
HADDD %1, %2
%endif
%endmacro
; HADDUWD reg, tmp
; Adds each pair of adjacent unsigned words in reg into one dword.
; Generic path: tmp = high words (logical shift right), reg = low words
; zero-extended (shift left then right), then add.
%macro HADDUWD 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwd %1, %1
%else
psrld %2, %1, 16
pslld %1, 16
psrld %1, 16
paddd %1, %2
%endif
%endmacro
; HADDUW reg, tmp
; Horizontal add of all unsigned words in reg; the sum ends up in the low
; dword. Generic path: pairwise widen with HADDUWD, then reduce with HADDD.
%macro HADDUW 2
%if cpuflag(xop) && sizeof%1 == 16
vphadduwq %1, %1
MOVHL %2, %1
paddd %1, %2
%else
HADDUWD %1, %2
HADDD %1, %2
%endif
%endmacro
; palignr wrapper with three implementations:
;  - 32-byte registers: two-lane emulation using a pre-permuted extra input (%5)
;  - SSSE3: the native instruction (three- or two-register form)
;  - otherwise: emulation with two byte shifts and an OR, using tmp as scratch
%macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
; AVX2 version uses a precalculated extra input that
; can be re-used across calls
%if sizeof%1==32
; %3 = abcdefgh ijklmnop (lower address)
; %2 = ABCDEFGH IJKLMNOP (higher address)
; vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
%if %4 < 16
palignr %1, %5, %3, %4 ; %1 = bcdefghi jklmnopA
%else
palignr %1, %2, %5, %4-16 ; %1 = pABCDEFG HIJKLMNO
%endif
%elif cpuflag(ssse3)
%if %0==5
palignr %1, %2, %3, %4
%else
palignr %1, %2, %3
%endif
%else
%define %%dst %1
%if %0==5
%ifnidn %1, %2
mova %%dst, %2
%endif
; drop the explicit dst so that both call forms see src1, src2, imm, tmp as %1..%4
%rotate 1
%endif
%ifnidn %4, %2
mova %4, %2
%endif
%if mmsize==8
psllq %%dst, (8-%3)*8
psrlq %4, %3*8
%else
pslldq %%dst, 16-%3
psrldq %4, %3
%endif
por %%dst, %4
%endif
%endmacro
; PSHUFLW operands...
; Shuffles the four low words of a register. 8-byte (MMX) registers only have
; pshufw for this; every wider register size uses pshuflw. The operand list is
; forwarded to the chosen instruction unchanged.
%macro PSHUFLW 1+
%if mmsize != 8
    pshuflw %1
%else
    pshufw %1
%endif
%endmacro
; shift a mmxreg by n bytes, or a xmmreg by 2*n bytes
; values shifted in are undefined
; faster if dst==src
; PSLLPIX / PSRLPIX prepend the direction letter and its sign to the caller's
; operands, so PSXLPIX receives: direction (l/r), sign (-1/1), dst, src, shift.
%define PSLLPIX PSXLPIX l, -1, ;dst, src, shift
%define PSRLPIX PSXLPIX r, 1, ;dst, src, shift
%macro PSXLPIX 5
%if mmsize == 8
%if %5&1
; odd shift counts use a real qword shift
ps%1lq %3, %4, %5*8
%else
; even shift counts are done as a word shuffle, with the immediate derived
; from the identity pattern q3210
pshufw %3, %4, (q3210<<8>>(8+%2*%5))&0xff
%endif
%else
ps%1ldq %3, %4, %5*2
%endif
%endmacro
; De-interleaves the bytes of two registers (numbers %2 and %4): the masked
; copies in m%1 / m%3 keep the bytes selected by the mask, while m%2 / m%4 are
; shifted right by 8 within each word to keep the other bytes.
; %5 is either a register number already holding the mask, or an operand the
; mask is loaded from.
%macro DEINTB 5 ; mask, reg1, mask, reg2, optional src to fill masks from
%ifnum %5
pand m%3, m%5, m%4 ; src .. y6 .. y4
pand m%1, m%5, m%2 ; dst .. y6 .. y4
%else
mova m%1, %5
pand m%3, m%1, m%4 ; src .. y6 .. y4
pand m%1, m%1, m%2 ; dst .. y6 .. y4
%endif
psrlw m%2, 8 ; dst .. y7 .. y5
psrlw m%4, 8 ; src .. y7 .. y5
%endmacro
; SUMSUB_BA size, a, b [, tmp]   (register numbers; size = padd/psub suffix)
; Butterfly: m<a> = a + b, m<b> = b - a.
; Without tmp the difference is derived as 2*b - (a+b); with tmp it is
; computed directly (and the AVX path renames tmp to a).
%macro SUMSUB_BA 3-4
%if %0==3
padd%1 m%2, m%3
padd%1 m%3, m%3
psub%1 m%3, m%2
%elif avx_enabled
padd%1 m%4, m%2, m%3
psub%1 m%3, m%2
SWAP %2, %4
%else
mova m%4, m%2
padd%1 m%2, m%3
psub%1 m%3, m%4
%endif
%endmacro
; SUMSUB_BADC size, a, b, c, d [, tmp]
; Two butterflies at once: (a, b) -> (a+b, b-a) and (c, d) -> (c+d, d-c).
; With tmp it expands to two SUMSUB_BA; without, the tmp-free form is used
; with the two butterflies interleaved instruction by instruction.
%macro SUMSUB_BADC 5-6
%if %0==6
SUMSUB_BA %1, %2, %3, %6
SUMSUB_BA %1, %4, %5, %6
%else
padd%1 m%2, m%3
padd%1 m%4, m%5
padd%1 m%3, m%3
padd%1 m%5, m%5
psub%1 m%3, m%2
psub%1 m%5, m%4
%endif
%endmacro
; HADAMARD4_V r0, r1, r2, r3 [, tmp]
; Two word-butterfly stages across four registers (a 4-point transform applied
; per lane). The greedy last parameter forwards an optional tmp to SUMSUB_BADC.
%macro HADAMARD4_V 4+
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %1, %3, %2, %4
%endmacro
; HADAMARD8_V r0..r7 [, tmp]
; Three word-butterfly stages across eight registers (an 8-point transform
; applied per lane). The greedy last parameter forwards an optional tmp.
%macro HADAMARD8_V 8+
SUMSUB_BADC w, %1, %2, %3, %4
SUMSUB_BADC w, %5, %6, %7, %8
SUMSUB_BADC w, %1, %3, %2, %4
SUMSUB_BADC w, %5, %7, %6, %8
SUMSUB_BADC w, %1, %5, %2, %6
SUMSUB_BADC w, %3, %7, %4, %8
%endmacro
; 2x2 transpose of adjacent element pairs between two registers using only
; shifts and logic ops; relies on the mask_10 / mask_1100 constants defined
; elsewhere. Leaves the single-line macros `mask` and `shift` defined.
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10]
%define shift 16
%elifidn %1, q
%define mask [mask_1100]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
mova m%5, mask ; ff00
mova m%6, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pand m%6, m%5 ; x5..
pandn m%5, m%3 ; ..x0
psrl%1 m%3, shift ; ..x1
por m%4, m%5 ; x4x0
por m%3, m%6 ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
mova m%5, m%4 ; x5x4
psll%1 m%4, shift ; x4..
pxor m%4, m%3 ; (x4^x1)x0
pand m%4, mask ; (x4^x1)..
pxor m%3, m%4 ; x4x0
psrl%1 m%4, shift ; ..(x1^x4)
pxor m%5, m%4 ; x5x1
SWAP %4, %3, %5
%endif
%endmacro
; Same interface as TRANS_SSE2, implemented with pblendw (width d) or shufps
; (width q). For width d the `ord` variant keeps element order; the other
; variant takes a cheaper blend + OR sequence.
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
%ifidn %2, ord
psrl%1 m%5, m%3, 16
pblendw m%5, m%4, q2222
psll%1 m%4, 16
pblendw m%4, m%3, q1111
SWAP %3, %5
%else
%if avx_enabled
pblendw m%5, m%3, m%4, q2222
SWAP %3, %5
%else
mova m%5, m%3
pblendw m%3, m%4, q2222
%endif
psll%1 m%4, 16
psrl%1 m%5, 16
por m%4, m%5
%endif
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
shufps m%3, m%3, m%4, q2020
SWAP %4, %5
%endif
%endmacro
; Same interface as TRANS_SSE2, implemented with XOP vpperm (width d, using
; the transd_shuf1 / transd_shuf2 constants defined elsewhere) or shufps
; (width q). %2 and %6 are accepted but not used.
%macro TRANS_XOP 5-6
%ifidn %1, d
vpperm m%5, m%3, m%4, [transd_shuf1]
vpperm m%3, m%3, m%4, [transd_shuf2]
%elifidn %1, q
shufps m%5, m%3, m%4, q3131
shufps m%3, m%4, q2020
%endif
SWAP %4, %5
%endmacro
; One Hadamard stage between two registers: an optional reordering step
; (selected by the distance %1) followed by the combining operation %2.
; TRANS is expected to be mapped to one of the TRANS_* macros by the including
; file. Leaves the single-line macro ORDER defined when %1 != 0.
%macro HADAMARD 5-6
; %1=distance in words (0 for vertical pass, 1/2/4 for horizontal passes)
; %2=sumsub/max/amax (sum and diff / maximum / maximum of absolutes)
; %3/%4: regs
; %5(%6): tmpregs
%if %1!=0 ; have to reorder stuff for horizontal op
%ifidn %2, sumsub
%define ORDER ord
; sumsub needs order because a-b != b-a unless a=b
%else
%define ORDER unord
; if we just max, order doesn't matter (allows pblendw+or in sse4)
%endif
%if %1==1
TRANS d, ORDER, %3, %4, %5, %6
%elif %1==2
%if mmsize==8
SBUTTERFLY dq, %3, %4, %5
%elif %0==6
TRANS q, ORDER, %3, %4, %5, %6
%else
TRANS q, ORDER, %3, %4, %5
%endif
%elif %1==4
SBUTTERFLY qdq, %3, %4, %5
%elif %1==8
SBUTTERFLY dqqq, %3, %4, %5
%endif
%endif
%ifidn %2, sumsub
SUMSUB_BA w, %3, %4, %5
%else
%ifidn %2, amax
; absolute values first, then fall through to the maximum
%if %0==6
ABSW2 m%3, m%4, m%3, m%4, m%5, m%6
%else
ABSW m%3, m%3, m%5
ABSW m%4, m%4, m%5
%endif
%endif
pmaxsw m%3, m%4
%endif
%endmacro
; HADAMARD2_2D r0, r1, r2, r3, tmp, size [, op=sumsub]
; Runs a sumsub stage on (r0,r1) and (r2,r3), then for each pair an interleave
; of granularity `size` followed by a second stage.
; If the 7th argument is a number it is used as a second tmp register and the
; second stage is `amax`; otherwise it names the second-stage operation.
%macro HADAMARD2_2D 6-7 sumsub
HADAMARD 0, sumsub, %1, %2, %5
HADAMARD 0, sumsub, %3, %4, %5
SBUTTERFLY %6, %1, %2, %5
%ifnum %7
HADAMARD 0, amax, %1, %2, %5, %7
%else
HADAMARD 0, %7, %1, %2, %5
%endif
SBUTTERFLY %6, %3, %4, %5
%ifnum %7
HADAMARD 0, amax, %3, %4, %5, %7
%else
HADAMARD 0, %7, %3, %4, %5
%endif
%endmacro
; HADAMARD4_2D r0, r1, r2, r3, tmp [, op=sumsub]
; Two HADAMARD2_2D passes (word then dword granularity) over four registers;
; the optional op is applied only in the final stage.
%macro HADAMARD4_2D 5-6 sumsub
HADAMARD2_2D %1, %2, %3, %4, %5, wd
HADAMARD2_2D %1, %3, %2, %4, %5, dq, %6
SWAP %2, %3
%endmacro
; HADAMARD4_2D_SSE r0, r1, r2, r3, tmp [, op=sumsub]
; 4x4 2D Hadamard variant that adds a qword interleave pass, so each register
; can carry data from two blocks (see the m0/m1 annotations); the optional op
; is applied only in the final stage.
%macro HADAMARD4_2D_SSE 5-6 sumsub
HADAMARD 0, sumsub, %1, %2, %5 ; 1st V row 0 + 1
HADAMARD 0, sumsub, %3, %4, %5 ; 1st V row 2 + 3
SBUTTERFLY wd, %1, %2, %5 ; %1: m0 1+0 %2: m1 1+0
SBUTTERFLY wd, %3, %4, %5 ; %3: m0 3+2 %4: m1 3+2
HADAMARD2_2D %1, %3, %2, %4, %5, dq
SBUTTERFLY qdq, %1, %2, %5
HADAMARD 0, %6, %1, %2, %5 ; 2nd H m1/m0 row 0+1
SBUTTERFLY qdq, %3, %4, %5
HADAMARD 0, %6, %3, %4, %5 ; 2nd H m1/m0 row 2+3
%endmacro
; HADAMARD8_2D r0..r7, tmp [, op=sumsub]
; 8x8 2D Hadamard built from HADAMARD2_2D passes at word, dword and qword
; granularity; the optional op is applied in the final (qword) passes.
; The register renaming at the end is skipped for `amax`.
%macro HADAMARD8_2D 9-10 sumsub
HADAMARD2_2D %1, %2, %3, %4, %9, wd
HADAMARD2_2D %5, %6, %7, %8, %9, wd
HADAMARD2_2D %1, %3, %2, %4, %9, dq
HADAMARD2_2D %5, %7, %6, %8, %9, dq
HADAMARD2_2D %1, %5, %3, %7, %9, qdq, %10
HADAMARD2_2D %2, %6, %4, %8, %9, qdq, %10
%ifnidn %10, amax
SWAP %2, %5
SWAP %4, %7
%endif
%endmacro
; doesn't include the "pmaddubsw hmul_8p" pass
; HADAMARD8_2D_HMUL r0..r7, tmp1, tmp2
; Vertical 4-point stages on both register halves, a further vertical
; butterfly between the halves, then horizontal stages at distance 2 (sumsub)
; and distance 1 (amax). After the first amax stage, register %5 is reused as
; the second tmp for the remaining ones.
%macro HADAMARD8_2D_HMUL 10
HADAMARD4_V %1, %2, %3, %4, %9
HADAMARD4_V %5, %6, %7, %8, %9
SUMSUB_BADC w, %1, %5, %2, %6, %9
HADAMARD 2, sumsub, %1, %5, %9, %10
HADAMARD 2, sumsub, %2, %6, %9, %10
SUMSUB_BADC w, %3, %7, %4, %8, %9
HADAMARD 2, sumsub, %3, %7, %9, %10
HADAMARD 2, sumsub, %4, %8, %9, %10
HADAMARD 1, amax, %1, %5, %9, %10
HADAMARD 1, amax, %2, %6, %9, %5
HADAMARD 1, amax, %3, %7, %9, %5
HADAMARD 1, amax, %4, %8, %9, %5
%endmacro
; SUMSUB2_AB size, a, b, out
; Computes m<a> = 2*a + b and m<out> = a - 2*b.
; a and out are register numbers; b is a register number on the %elifnum path
; and any other operand (e.g. memory) on the final path.
; XOP path: uses multiply-accumulate with the p<size>_m2 / p<size>_2 constants
; defined elsewhere (presumably -2 and 2 -- confirm).
%macro SUMSUB2_AB 4
%if cpuflag(xop)
pmacs%1%1 m%4, m%3, [p%1_m2], m%2
pmacs%1%1 m%2, m%2, [p%1_2], m%3
%elifnum %3
psub%1 m%4, m%2, m%3
psub%1 m%4, m%3
padd%1 m%2, m%2
padd%1 m%2, m%3
%else
mova m%4, m%2
padd%1 m%2, m%2
padd%1 m%2, %3
psub%1 m%4, %3
psub%1 m%4, %3
%endif
%endmacro
; SUMSUBD2_AB size, a, b, tmp1, tmp2
; Computes m<a> = (a>>1) - b and m<b> = a + (b>>1), using arithmetic shifts.
; When tmp1 is a register number, both tmps are registers and the results are
; renamed into place with SWAP; otherwise the tmps are operands (e.g. memory)
; used to save the original a and b.
%macro SUMSUBD2_AB 5
%ifnum %4
psra%1 m%5, m%2, 1 ; %3: %3>>1
psra%1 m%4, m%3, 1 ; %2: %2>>1
padd%1 m%4, m%2 ; %3: %3>>1+%2
psub%1 m%5, m%3 ; %2: %2>>1-%3
SWAP %2, %5
SWAP %3, %4
%else
mova %5, m%2
mova %4, m%3
psra%1 m%3, 1 ; %3: %3>>1
psra%1 m%2, 1 ; %2: %2>>1
padd%1 m%3, %5 ; %3: %3>>1+%2
psub%1 m%2, %4 ; %2: %2>>1-%3
%endif
%endmacro
; DCT4_1D r0, r1, r2, r3, tmp
; One 4-point word transform pass built from two butterflies, one further
; butterfly and one SUMSUB2_AB, with results renamed into place by SWAP.
; tmp is either a spare register number or, otherwise, an address used as a
; spill slot for one intermediate.
%macro DCT4_1D 5
%ifnum %5
SUMSUB_BADC w, %4, %1, %3, %2, %5
SUMSUB_BA w, %3, %4, %5
SUMSUB2_AB w, %1, %2, %5
SWAP %1, %3, %4, %5, %2
%else
SUMSUB_BADC w, %4, %1, %3, %2
SUMSUB_BA w, %3, %4
mova [%5], m%2
SUMSUB2_AB w, %1, [%5], %2
SWAP %1, %3, %4, %2
%endif
%endmacro
; IDCT4_1D size, r0, r1, r2, r3, tmp1 [, tmp2]
; One 4-point inverse transform pass; the row formulas are listed at the end.
; When tmp1 is a register number, tmp2 must be given as a second spare
; register; otherwise tmp1 is an address whose first two vector-sized slots
; (16 bytes apart for size w, 32 bytes apart otherwise) are used as scratch.
%macro IDCT4_1D 6-7
%ifnum %6
SUMSUBD2_AB %1, %3, %5, %7, %6
; %3: %3>>1-%5 %5: %3+%5>>1
SUMSUB_BA %1, %4, %2, %7
; %4: %2+%4 %2: %2-%4
SUMSUB_BADC %1, %5, %4, %3, %2, %7
; %5: %2+%4 + (%3+%5>>1)
; %4: %2+%4 - (%3+%5>>1)
; %3: %2-%4 + (%3>>1-%5)
; %2: %2-%4 - (%3>>1-%5)
%else
%ifidn %1, w
SUMSUBD2_AB %1, %3, %5, [%6], [%6+16]
%else
SUMSUBD2_AB %1, %3, %5, [%6], [%6+32]
%endif
SUMSUB_BA %1, %4, %2
SUMSUB_BADC %1, %5, %4, %3, %2
%endif
SWAP %2, %5, %4
; %2: %2+%4 + (%3+%5>>1) row0
; %3: %2-%4 + (%3>>1-%5) row1
; %4: %2-%4 - (%3>>1-%5) row2
; %5: %2+%4 - (%3+%5>>1) row3
%endmacro
; LOAD_DIFF dst, tmp, zero|none, src1, src2 [, aligned=1]
; dst = src1 - src2 as words.
; High bit depth: inputs are already words; the aligned flag picks mova vs
; movu for src1, and src2 is used as a direct memory operand except on the
; unaligned non-AVX path.
; 8-bit: both inputs are widened from bytes. With a zero register (%3) this is
; a plain zero-extension. With the token `none`, src1 is interleaved with
; src2 and src2 with itself, so the common high bytes cancel in the subtraction.
%macro LOAD_DIFF 5-6 1
%if HIGH_BIT_DEPTH
%if %6 ; %5 aligned?
mova %1, %4
psubw %1, %5
%elif cpuflag(avx)
movu %1, %4
psubw %1, %5
%else
movu %1, %4
movu %2, %5
psubw %1, %2
%endif
%else ; !HIGH_BIT_DEPTH
movh %1, %4
movh %2, %5
%ifidn %3, none
punpcklbw %1, %2
punpcklbw %2, %2
%else
punpcklbw %1, %3
punpcklbw %2, %3
%endif
psubw %1, %2
%endif ; HIGH_BIT_DEPTH
%endmacro
; Loads four rows of differences between the block at ptr %7 (row stride
; FENC_STRIDE) and the block at ptr %8 (row stride FDEC_STRIDE). The dst
; register numbers %1..%4 double as the row indices.
; 8-bit SSSE3 path: bytes from both blocks are interleaved and combined with
; pmaddubsw against the multiplier register m%6 (presumably +1/-1 byte pairs
; -- confirm at call sites). Otherwise: four LOAD_DIFF calls with m%5 as tmp
; and m%6 as the third LOAD_DIFF argument.
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
%if BIT_DEPTH == 8 && cpuflag(ssse3)
movh m%2, [%8+%1*FDEC_STRIDE]
movh m%1, [%7+%1*FENC_STRIDE]
punpcklbw m%1, m%2
movh m%3, [%8+%2*FDEC_STRIDE]
movh m%2, [%7+%2*FENC_STRIDE]
punpcklbw m%2, m%3
movh m%4, [%8+%3*FDEC_STRIDE]
movh m%3, [%7+%3*FENC_STRIDE]
punpcklbw m%3, m%4
movh m%5, [%8+%4*FDEC_STRIDE]
movh m%4, [%7+%4*FENC_STRIDE]
punpcklbw m%4, m%5
pmaddubsw m%1, m%6
pmaddubsw m%2, m%6
pmaddubsw m%3, m%6
pmaddubsw m%4, m%6
%else
LOAD_DIFF m%1, m%5, m%6, [%7+%1*FENC_STRIDEB], [%8+%1*FDEC_STRIDEB]
LOAD_DIFF m%2, m%5, m%6, [%7+%2*FENC_STRIDEB], [%8+%2*FDEC_STRIDEB]
LOAD_DIFF m%3, m%5, m%6, [%7+%3*FENC_STRIDEB], [%8+%3*FDEC_STRIDEB]
LOAD_DIFF m%4, m%5, m%6, [%7+%4*FENC_STRIDEB], [%8+%4*FDEC_STRIDEB]
%endif
%endmacro
; STORE_DCT r0, r1, r2, r3, base, offset
; Stores four registers to [base+offset]: the four low qwords go to bytes
; 0..31 and the four high qwords to bytes 32..63.
%macro STORE_DCT 6
movq [%5+%6+ 0], m%1
movq [%5+%6+ 8], m%2
movq [%5+%6+16], m%3
movq [%5+%6+24], m%4
movhps [%5+%6+32], m%1
movhps [%5+%6+40], m%2
movhps [%5+%6+48], m%3
movhps [%5+%6+56], m%4
%endmacro
; STORE_IDCT reg0, reg1, reg2, reg3
; Stores eight rows around r0 (rows -4..+3, row stride FDEC_STRIDE): each
; register supplies two consecutive rows, its high half first and its low
; half (movh) second.
%macro STORE_IDCT 4
movhps [r0-4*FDEC_STRIDE], %1
movh [r0-3*FDEC_STRIDE], %1
movhps [r0-2*FDEC_STRIDE], %2
movh [r0-1*FDEC_STRIDE], %2
movhps [r0+0*FDEC_STRIDE], %3
movh [r0+1*FDEC_STRIDE], %3
movhps [r0+2*FDEC_STRIDE], %4
movh [r0+3*FDEC_STRIDE], %4
%endmacro
; Loads four rows of differences between two blocks addressed by pointers
; %8 / %9 (default r0 / r2). Row offsets are 0, r1, 2*r1, r4 for the first
; block and 0, r3, 2*r3, r5 for the second (r4 / r5 presumably hold 3*stride
; -- confirm at call sites). m%7 is passed as LOAD_DIFF's third argument.
; When `increment` (%10) is nonzero, both pointers are advanced by four rows.
%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11
LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11
LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]
%endif
%endmacro
; 2xdst, 2xtmp, 2xsrcrow
; Loads rows %5 and %6 from r1 (stride FENC_STRIDE) and rows %5-4 and %6-4
; from r2 (stride FDEC_STRIDE), zero-extends the bytes to words and leaves the
; two row differences in m%1 and m%2. m%3 / m%4 are clobbered.
%macro LOAD_DIFF16x2_AVX2 6
pmovzxbw m%1, [r1+%5*FENC_STRIDE]
pmovzxbw m%2, [r1+%6*FENC_STRIDE]
pmovzxbw m%3, [r2+(%5-4)*FDEC_STRIDE]
pmovzxbw m%4, [r2+(%6-4)*FDEC_STRIDE]
psubw m%1, m%3
psubw m%2, m%4
%endmacro
; DIFFx2 res1, res2, tmp, zero, mem1, mem2 [, unused]
; For each of the two word registers res1 / res2: shifts right by 6
; (arithmetic), adds the bytes loaded from mem1 / mem2 widened to words
; (saturating add), then packs both into res2 as unsigned bytes
; (res2 in the low half, res1 in the high half). Nothing is stored to memory.
; The optional 7th argument is accepted but not used.
%macro DIFFx2 6-7
movh %3, %5
punpcklbw %3, %4
psraw %1, 6
paddsw %1, %3
movh %3, %6
punpcklbw %3, %4
psraw %2, 6
paddsw %2, %3
packuswb %2, %1
%endmacro
; (high depth) in: %1, %2, min to clip, max to clip, mem128
; in: %1, tmp, %3, mem64
; Adds a residual to the pixels at the memory operand and stores the result
; back to the same location.
; High bit depth: %1 / %2 hold dword residuals; they are shifted right by 6,
; packed to words, added to the pixels at %5, clamped to [%3, %4] and stored.
; 8-bit: %1 holds word residuals; they are shifted right by 6, added (with
; saturation) to the pixels at %4 widened via the zero register %3, packed to
; unsigned bytes and stored.
%macro STORE_DIFF 4-5
%if HIGH_BIT_DEPTH
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddw %1, %5
CLIPW %1, %3, %4
mova %5, %1
%else
movh %2, %4
punpcklbw %2, %3
psraw %1, 6
paddsw %1, %2
packuswb %1, %1
movh %4, %1
%endif
%endmacro
; SHUFFLE_MASK_W w0, w1, ..., w7
; Emits a 16-byte byte-shuffle mask from eight word indices. An index below
; 0x80 expands to the byte pair (2*idx, 2*idx+1); an index of 0x80 or above is
; emitted twice unchanged.
%macro SHUFFLE_MASK_W 8
    %rep 8
        %if %1 < 0x80
            db %1*2, %1*2+1
        %else
            db %1, %1
        %endif
        %rotate 1
    %endrep
%endmacro
; ACCUM instruction, accum, input, iteration
; Folds `input` into `accum` (both register numbers). On iteration zero there
; is nothing to combine with yet, so the two register names are exchanged
; instead; on any other iteration `instruction m<accum>, m<input>` is emitted.
%macro ACCUM 4
%if (%4) == 0
    SWAP %2, %3
%else
    %1 m%2, m%3
%endif
%endmacro