x264 source for verification 2026-05-22
This commit is contained in:
526
common/mips/dct-c.c
Normal file
526
common/mips/dct-c.c
Normal file
@@ -0,0 +1,526 @@
|
||||
/*****************************************************************************
|
||||
* dct-c.c: msa transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "macros.h"
|
||||
#include "dct.h"
|
||||
|
||||
#if !HIGH_BIT_DEPTH
|
||||
/* One 1-D pass of the 4-point inverse transform used by
 * avc_idct4x4_addblk_msa().
 *
 * in0..in3   : v8i16 input vectors (each is evaluated more than once, so pass
 *              side-effect-free expressions only).
 * out0..out3 : v8i16 lvalues receiving the BUTTERFLY_4 result.
 *
 * Even part:  in0 + in2, in0 - in2.
 * Odd part:   (in1 >> 1) - in3, in1 + (in3 >> 1).
 * The four intermediates are then combined by BUTTERFLY_4 (defined in
 * macros.h, not shown here).
 *
 * Every argument is parenthesized so that expression arguments bind as the
 * caller intended, and the body is wrapped in do { } while( 0 ) so the macro
 * behaves as a single statement (safe in an unbraced if/else). */
#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )  \
do                                                                  \
{                                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
                                                                    \
    tmp0_m = ( in0 ) + ( in2 );                                     \
    tmp1_m = ( in0 ) - ( in2 );                                     \
    tmp2_m = ( ( in1 ) >> 1 ) - ( in3 );                            \
    tmp3_m = ( in1 ) + ( ( in3 ) >> 1 );                            \
                                                                    \
    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m,                    \
                 out0, out1, out2, out3 );                          \
} while( 0 )
|
||||
|
||||
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
|
||||
int32_t i_src_stride )
|
||||
{
|
||||
v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
|
||||
v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
|
||||
v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
|
||||
v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;
|
||||
|
||||
LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
|
||||
UNPCK_R_SH_SW( src0, src0_r );
|
||||
UNPCK_R_SH_SW( src1, src1_r );
|
||||
UNPCK_R_SH_SW( src2, src2_r );
|
||||
UNPCK_R_SH_SW( src3, src3_r );
|
||||
BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
|
||||
tmp0, tmp3, tmp2, tmp1 );
|
||||
BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
|
||||
hor_res0, hor_res3, hor_res2, hor_res1 );
|
||||
TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
|
||||
hor_res0, hor_res1, hor_res2, hor_res3 );
|
||||
BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
|
||||
tmp0, tmp3, tmp2, tmp1 );
|
||||
BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
|
||||
ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
|
||||
SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
|
||||
PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
|
||||
ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
|
||||
ver_res0, ver_res1, ver_res2, ver_res3 );
|
||||
PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
|
||||
ST_SH2( ver_res0, ver_res2, p_dst, 8 );
|
||||
}
|
||||
|
||||
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
|
||||
uint8_t *p_ref, int32_t i_dst_stride,
|
||||
int16_t *p_dst )
|
||||
{
|
||||
uint32_t i_src0, i_src1, i_src2, i_src3;
|
||||
uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
|
||||
v16i8 src = { 0 };
|
||||
v16i8 ref = { 0 };
|
||||
v16u8 inp0, inp1;
|
||||
v8i16 diff0, diff1, diff2, diff3;
|
||||
v8i16 temp0, temp1, temp2, temp3;
|
||||
|
||||
LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
|
||||
LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );
|
||||
|
||||
INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
|
||||
INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );
|
||||
|
||||
ILVRL_B2_UB( src, ref, inp0, inp1 );
|
||||
|
||||
HSUB_UB2_SH( inp0, inp1, diff0, diff2 );
|
||||
|
||||
diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
|
||||
diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
|
||||
|
||||
BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
|
||||
|
||||
diff0 = temp0 + temp1;
|
||||
diff1 = ( temp3 << 1 ) + temp2;
|
||||
diff2 = temp0 - temp1;
|
||||
diff3 = temp3 - ( temp2 << 1 );
|
||||
|
||||
TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
|
||||
temp0, temp1, temp2, temp3 );
|
||||
BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
|
||||
|
||||
temp0 = diff0 + diff1;
|
||||
temp1 = ( diff3 << 1 ) + diff2;
|
||||
temp2 = diff0 - diff1;
|
||||
temp3 = diff3 - ( diff2 << 1 );
|
||||
|
||||
ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
|
||||
ST_UB2( inp0, inp1, p_dst, 8 );
|
||||
}
|
||||
|
||||
static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
|
||||
int16_t pi_level[16] )
|
||||
{
|
||||
v8i16 src0, src1;
|
||||
v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
|
||||
v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };
|
||||
|
||||
LD_SH2( pi_dct, 8, src0, src1 );
|
||||
VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
|
||||
ST_SH2( mask0, mask1, pi_level, 8 );
|
||||
}
|
||||
|
||||
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
v8i16 src0, src1, src2, src3;
|
||||
v8i16 hres0, hres1, hres2, hres3;
|
||||
v8i16 vres0, vres1, vres2, vres3;
|
||||
v8i16 zeros = { 0 };
|
||||
|
||||
LD4x4_SH( p_src, src0, src1, src2, src3 );
|
||||
AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
|
||||
TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
|
||||
hres0, hres1, hres2, hres3 );
|
||||
AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
|
||||
SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
|
||||
ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
|
||||
ST_SH2( zeros, zeros, p_src, 8 );
|
||||
}
|
||||
|
||||
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
int16_t i_dc;
|
||||
uint32_t i_src0, i_src1, i_src2, i_src3;
|
||||
v16u8 pred = { 0 };
|
||||
v16i8 out;
|
||||
v8i16 input_dc, pred_r, pred_l;
|
||||
|
||||
i_dc = ( p_src[0] + 32 ) >> 6;
|
||||
input_dc = __msa_fill_h( i_dc );
|
||||
p_src[ 0 ] = 0;
|
||||
|
||||
LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
|
||||
INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
|
||||
UNPCK_UB_SH( pred, pred_r, pred_l );
|
||||
|
||||
pred_r += input_dc;
|
||||
pred_l += input_dc;
|
||||
|
||||
CLIP_SH2_0_255( pred_r, pred_l );
|
||||
out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
|
||||
ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
|
||||
}
|
||||
|
||||
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v8i16 vec0, vec1, vec2, vec3;
|
||||
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
|
||||
v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
|
||||
v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
|
||||
v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
|
||||
v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
|
||||
v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
|
||||
v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
|
||||
v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
v16i8 zeros = { 0 };
|
||||
|
||||
p_src[ 0 ] += 32;
|
||||
|
||||
LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
|
||||
|
||||
vec0 = src0 + src4;
|
||||
vec1 = src0 - src4;
|
||||
vec2 = src2 >> 1;
|
||||
vec2 = vec2 - src6;
|
||||
vec3 = src6 >> 1;
|
||||
vec3 = src2 + vec3;
|
||||
|
||||
BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );
|
||||
|
||||
vec0 = src7 >> 1;
|
||||
vec0 = src5 - vec0 - src3 - src7;
|
||||
vec1 = src3 >> 1;
|
||||
vec1 = src1 - vec1 + src7 - src3;
|
||||
vec2 = src5 >> 1;
|
||||
vec2 = vec2 - src1 + src7 + src5;
|
||||
vec3 = src1 >> 1;
|
||||
vec3 = vec3 + src3 + src5 + src1;
|
||||
tmp4 = vec3 >> 2;
|
||||
tmp4 += vec0;
|
||||
tmp5 = vec2 >> 2;
|
||||
tmp5 += vec1;
|
||||
tmp6 = vec1 >> 2;
|
||||
tmp6 -= vec2;
|
||||
tmp7 = vec0 >> 2;
|
||||
tmp7 = vec3 - tmp7;
|
||||
|
||||
BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
|
||||
res0, res1, res2, res3, res4, res5, res6, res7 );
|
||||
TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
|
||||
res0, res1, res2, res3, res4, res5, res6, res7 );
|
||||
UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
|
||||
UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
|
||||
UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
|
||||
UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
|
||||
UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
|
||||
UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
|
||||
UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
|
||||
UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
|
||||
BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
|
||||
vec0_r, vec0_l, vec1_l, vec1_r );
|
||||
|
||||
vec2_r = tmp2_r >> 1;
|
||||
vec2_l = tmp2_l >> 1;
|
||||
vec2_r -= tmp6_r;
|
||||
vec2_l -= tmp6_l;
|
||||
vec3_r = tmp6_r >> 1;
|
||||
vec3_l = tmp6_l >> 1;
|
||||
vec3_r += tmp2_r;
|
||||
vec3_l += tmp2_l;
|
||||
|
||||
BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
|
||||
tmp0_r, tmp2_r, tmp4_r, tmp6_r );
|
||||
BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
|
||||
tmp0_l, tmp2_l, tmp4_l, tmp6_l );
|
||||
|
||||
vec0_r = tmp7_r >> 1;
|
||||
vec0_l = tmp7_l >> 1;
|
||||
vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
|
||||
vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
|
||||
vec1_r = tmp3_r >> 1;
|
||||
vec1_l = tmp3_l >> 1;
|
||||
vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
|
||||
vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
|
||||
vec2_r = tmp5_r >> 1;
|
||||
vec2_l = tmp5_l >> 1;
|
||||
vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
|
||||
vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
|
||||
vec3_r = tmp1_r >> 1;
|
||||
vec3_l = tmp1_l >> 1;
|
||||
vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
|
||||
vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
|
||||
tmp1_r = vec3_r >> 2;
|
||||
tmp1_l = vec3_l >> 2;
|
||||
tmp1_r += vec0_r;
|
||||
tmp1_l += vec0_l;
|
||||
tmp3_r = vec2_r >> 2;
|
||||
tmp3_l = vec2_l >> 2;
|
||||
tmp3_r += vec1_r;
|
||||
tmp3_l += vec1_l;
|
||||
tmp5_r = vec1_r >> 2;
|
||||
tmp5_l = vec1_l >> 2;
|
||||
tmp5_r -= vec2_r;
|
||||
tmp5_l -= vec2_l;
|
||||
tmp7_r = vec0_r >> 2;
|
||||
tmp7_l = vec0_l >> 2;
|
||||
tmp7_r = vec3_r - tmp7_r;
|
||||
tmp7_l = vec3_l - tmp7_l;
|
||||
|
||||
BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
|
||||
res0_r, res0_l, res7_l, res7_r );
|
||||
BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
|
||||
res1_r, res1_l, res6_l, res6_r );
|
||||
BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
|
||||
res2_r, res2_l, res5_l, res5_r );
|
||||
BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
|
||||
res3_r, res3_l, res4_l, res4_r );
|
||||
SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
|
||||
SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
|
||||
SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
|
||||
SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
|
||||
PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
|
||||
res0, res1, res2, res3 );
|
||||
PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
|
||||
res4, res5, res6, res7 );
|
||||
LD_SB8( p_dst, i_dst_stride,
|
||||
dst0, dst1, dst2, dst3,
|
||||
dst4, dst5, dst6, dst7 );
|
||||
ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
|
||||
tmp0, tmp1, tmp2, tmp3 );
|
||||
ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
|
||||
tmp4, tmp5, tmp6, tmp7 );
|
||||
ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
|
||||
res0, res1, res2, res3 );
|
||||
ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
|
||||
res4, res5, res6, res7 );
|
||||
CLIP_SH4_0_255( res0, res1, res2, res3 );
|
||||
CLIP_SH4_0_255( res4, res5, res6, res7 );
|
||||
PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
|
||||
dst0, dst1, dst2, dst3 );
|
||||
ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
|
||||
p_dst += ( 4 * i_dst_stride );
|
||||
ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
|
||||
}
|
||||
|
||||
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
|
||||
int16_t *p_dst, int32_t i_dst_stride )
|
||||
{
|
||||
v8i16 src0, src1, src2, src3;
|
||||
v4i32 src0_r, src1_r, src2_r, src3_r;
|
||||
v4i32 hres0, hres1, hres2, hres3;
|
||||
v8i16 vres0, vres1, vres2, vres3;
|
||||
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
|
||||
v2i64 res0, res1;
|
||||
|
||||
LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
|
||||
UNPCK_R_SH_SW( src0, src0_r );
|
||||
UNPCK_R_SH_SW( src1, src1_r );
|
||||
UNPCK_R_SH_SW( src2, src2_r );
|
||||
UNPCK_R_SH_SW( src3, src3_r );
|
||||
BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
|
||||
BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
|
||||
TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
|
||||
hres0, hres1, hres2, hres3 );
|
||||
BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
|
||||
BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
|
||||
PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
|
||||
vres0, vres1, vres2, vres3 );
|
||||
PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
|
||||
ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
|
||||
}
|
||||
|
||||
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
|
||||
uint8_t *pred_ptr, int32_t i_pred_stride )
|
||||
{
|
||||
int16_t i_sum;
|
||||
uint32_t i_src0, i_src1, i_src2, i_src3;
|
||||
uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
|
||||
v16i8 src = { 0 };
|
||||
v16i8 pred = { 0 };
|
||||
v16u8 src_l0, src_l1;
|
||||
v8i16 diff0, diff1;
|
||||
|
||||
LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
|
||||
LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
|
||||
INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
|
||||
INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
|
||||
ILVRL_B2_UB( src, pred, src_l0, src_l1 );
|
||||
HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
|
||||
i_sum = HADD_UH_U32( diff0 + diff1 );
|
||||
|
||||
return i_sum;
|
||||
}
|
||||
|
||||
/* In-place forward 4x4 DC transform of d[16] (row stride 4 elements). */
void x264_dct4x4dc_msa( int16_t d[16] )
{
    int16_t *p_coef = d;

    /* Source and destination are the same buffer. */
    avc_dct4x4dc_msa( p_coef, p_coef, 4 );
}
|
||||
|
||||
/* In-place inverse 4x4 DC transform of d[16]; both strides are 4 elements. */
void x264_idct4x4dc_msa( int16_t d[16] )
{
    int16_t *p_coef = d;

    /* Source and destination are the same buffer. */
    avc_idct4x4dc_msa( p_coef, 4, p_coef, 4 );
}
|
||||
|
||||
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
|
||||
{
|
||||
avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
|
||||
{
|
||||
avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 0],
|
||||
&pi_dct[2][0], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4],
|
||||
&pi_dct[3][0], FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
|
||||
{
|
||||
x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] );
|
||||
x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] );
|
||||
x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 0], &pi_dct[8] );
|
||||
x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] );
|
||||
}
|
||||
|
||||
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
|
||||
{
|
||||
avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
|
||||
{
|
||||
avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
|
||||
avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE );
|
||||
avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 0],
|
||||
&pi_dct[2][0], FDEC_STRIDE );
|
||||
avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8],
|
||||
&pi_dct[3][0], FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
|
||||
{
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 0],
|
||||
&pi_dct[2], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4],
|
||||
&pi_dct[3], FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
|
||||
{
|
||||
for( int32_t i = 0; i < 4; i++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE )
|
||||
{
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE );
|
||||
avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE );
|
||||
}
|
||||
}
|
||||
|
||||
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
|
||||
uint8_t *p_ref )
|
||||
{
|
||||
avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst );
|
||||
}
|
||||
|
||||
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
|
||||
uint8_t *p_ref )
|
||||
{
|
||||
avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE,
|
||||
&p_ref[0], FDEC_STRIDE, p_dst[0] );
|
||||
avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE, &p_ref[4],
|
||||
FDEC_STRIDE, p_dst[1] );
|
||||
avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 0],
|
||||
FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 0],
|
||||
FDEC_STRIDE, p_dst[2] );
|
||||
avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4],
|
||||
FENC_STRIDE, &p_ref[4 * FDEC_STRIDE + 4],
|
||||
FDEC_STRIDE, p_dst[3] );
|
||||
}
|
||||
|
||||
void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
|
||||
uint8_t *p_src,
|
||||
uint8_t *p_ref )
|
||||
{
|
||||
x264_sub8x8_dct_msa( &p_dst[ 0], &p_src[0], &p_ref[0] );
|
||||
x264_sub8x8_dct_msa( &p_dst[ 4], &p_src[8], &p_ref[8] );
|
||||
x264_sub8x8_dct_msa( &p_dst[ 8], &p_src[8 * FENC_STRIDE + 0],
|
||||
&p_ref[8*FDEC_STRIDE+0] );
|
||||
x264_sub8x8_dct_msa( &p_dst[12], &p_src[8 * FENC_STRIDE + 8],
|
||||
&p_ref[8*FDEC_STRIDE+8] );
|
||||
}
|
||||
|
||||
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
|
||||
uint8_t *p_pix1, uint8_t *p_pix2 )
|
||||
{
|
||||
int32_t d0, d1, d2, d3;
|
||||
|
||||
pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
|
||||
&p_pix2[0], FDEC_STRIDE );
|
||||
pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
|
||||
&p_pix2[4], FDEC_STRIDE );
|
||||
pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||
&p_pix2[4 * FDEC_STRIDE + 0],
|
||||
FDEC_STRIDE );
|
||||
pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||
&p_pix2[4 * FDEC_STRIDE + 4],
|
||||
FDEC_STRIDE );
|
||||
|
||||
BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
|
||||
BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
|
||||
}
|
||||
|
||||
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
|
||||
uint8_t *p_pix1, uint8_t *p_pix2 )
|
||||
{
|
||||
int32_t a0, a1, a2, a3, a4, a5, a6, a7;
|
||||
int32_t b0, b1, b2, b3, b4, b5, b6, b7;
|
||||
|
||||
a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||
&p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||
a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||
&p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||
a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||
&p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||
a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||
&p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||
a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||
&p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||
a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||
&p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||
a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
|
||||
&p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
|
||||
a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
|
||||
&p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );
|
||||
|
||||
BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
|
||||
b0, b1, b2, b3, b7, b6, b5, b4 );
|
||||
BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
|
||||
a0, a1, a2, a3, a7, a6, a5, a4 );
|
||||
BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
|
||||
pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
|
||||
pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
|
||||
}
|
||||
|
||||
/* Public entry point for the 4x4 frame scan. Note the argument order: the
 * public function takes (output, input), the static helper (input, output). */
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    int16_t *p_in = pi_dct;
    int16_t *p_out = pi_level;

    avc_zigzag_scan_4x4_frame_msa( p_in, p_out );
}
|
||||
#endif
|
||||
64
common/mips/dct.h
Normal file
64
common/mips/dct.h
Normal file
@@ -0,0 +1,64 @@
|
||||
/*****************************************************************************
|
||||
* dct.h: msa transform and zigzag
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_MIPS_DCT_H
|
||||
#define X264_MIPS_DCT_H
|
||||
|
||||
#define x264_dct4x4dc_msa x264_template(dct4x4dc_msa)
|
||||
void x264_dct4x4dc_msa( int16_t d[16] );
|
||||
#define x264_idct4x4dc_msa x264_template(idct4x4dc_msa)
|
||||
void x264_idct4x4dc_msa( int16_t d[16] );
|
||||
#define x264_add4x4_idct_msa x264_template(add4x4_idct_msa)
|
||||
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
|
||||
#define x264_add8x8_idct_msa x264_template(add8x8_idct_msa)
|
||||
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
|
||||
#define x264_add16x16_idct_msa x264_template(add16x16_idct_msa)
|
||||
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
|
||||
#define x264_add8x8_idct8_msa x264_template(add8x8_idct8_msa)
|
||||
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
|
||||
#define x264_add16x16_idct8_msa x264_template(add16x16_idct8_msa)
|
||||
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
|
||||
#define x264_add8x8_idct_dc_msa x264_template(add8x8_idct_dc_msa)
|
||||
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
|
||||
#define x264_add16x16_idct_dc_msa x264_template(add16x16_idct_dc_msa)
|
||||
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
|
||||
#define x264_sub4x4_dct_msa x264_template(sub4x4_dct_msa)
|
||||
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src, uint8_t *p_ref );
|
||||
#define x264_sub8x8_dct_msa x264_template(sub8x8_dct_msa)
|
||||
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
|
||||
uint8_t *p_ref );
|
||||
#define x264_sub16x16_dct_msa x264_template(sub16x16_dct_msa)
|
||||
void x264_sub16x16_dct_msa( int16_t p_dst[16][16], uint8_t *p_src,
|
||||
uint8_t *p_ref );
|
||||
#define x264_sub8x8_dct_dc_msa x264_template(sub8x8_dct_dc_msa)
|
||||
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1,
|
||||
uint8_t *p_pix2 );
|
||||
#define x264_sub8x16_dct_dc_msa x264_template(sub8x16_dct_dc_msa)
|
||||
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1,
|
||||
uint8_t *p_pix2 );
|
||||
#define x264_zigzag_scan_4x4_frame_msa x264_template(zigzag_scan_4x4_frame_msa)
|
||||
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );
|
||||
|
||||
#endif
|
||||
2011
common/mips/deblock-c.c
Normal file
2011
common/mips/deblock-c.c
Normal file
File diff suppressed because it is too large
Load Diff
52
common/mips/deblock.h
Normal file
52
common/mips/deblock.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/*****************************************************************************
|
||||
* deblock.h: msa deblocking
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2017-2025 x264 project
|
||||
*
|
||||
* Authors: Anton Mitrofanov <BugMaster@narod.ru>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_MIPS_DEBLOCK_H
|
||||
#define X264_MIPS_DEBLOCK_H
|
||||
|
||||
#if !HIGH_BIT_DEPTH
|
||||
#define x264_deblock_v_luma_msa x264_template(deblock_v_luma_msa)
|
||||
void x264_deblock_v_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_luma_msa x264_template(deblock_h_luma_msa)
|
||||
void x264_deblock_h_luma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_chroma_msa x264_template(deblock_v_chroma_msa)
|
||||
void x264_deblock_v_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_h_chroma_msa x264_template(deblock_h_chroma_msa)
|
||||
void x264_deblock_h_chroma_msa( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
|
||||
#define x264_deblock_v_luma_intra_msa x264_template(deblock_v_luma_intra_msa)
|
||||
void x264_deblock_v_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_luma_intra_msa x264_template(deblock_h_luma_intra_msa)
|
||||
void x264_deblock_h_luma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_v_chroma_intra_msa x264_template(deblock_v_chroma_intra_msa)
|
||||
void x264_deblock_v_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_h_chroma_intra_msa x264_template(deblock_h_chroma_intra_msa)
|
||||
void x264_deblock_h_chroma_intra_msa( uint8_t *pix, intptr_t stride, int alpha, int beta );
|
||||
#define x264_deblock_strength_msa x264_template(deblock_strength_msa)
|
||||
void x264_deblock_strength_msa( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
|
||||
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
|
||||
int bframe );
|
||||
#endif
|
||||
|
||||
#endif
|
||||
1952
common/mips/macros.h
Normal file
1952
common/mips/macros.h
Normal file
File diff suppressed because it is too large
Load Diff
3696
common/mips/mc-c.c
Normal file
3696
common/mips/mc-c.c
Normal file
File diff suppressed because it is too large
Load Diff
32
common/mips/mc.h
Normal file
32
common/mips/mc.h
Normal file
@@ -0,0 +1,32 @@
|
||||
/*****************************************************************************
|
||||
* mc.h: msa motion compensation
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Neha Rana <neha.rana@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_MIPS_MC_H
|
||||
#define X264_MIPS_MC_H
|
||||
|
||||
#define x264_mc_init_mips x264_template(mc_init_mips)
|
||||
void x264_mc_init_mips( uint32_t cpu, x264_mc_functions_t *pf );
|
||||
|
||||
#endif
|
||||
1491
common/mips/pixel-c.c
Normal file
1491
common/mips/pixel-c.c
Normal file
File diff suppressed because it is too large
Load Diff
228
common/mips/pixel.h
Normal file
228
common/mips/pixel.h
Normal file
@@ -0,0 +1,228 @@
|
||||
/*****************************************************************************
|
||||
* pixel.h: msa pixel metrics
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_MIPS_PIXEL_H
|
||||
#define X264_MIPS_PIXEL_H
|
||||
|
||||
#define x264_pixel_sad_16x16_msa x264_template(pixel_sad_16x16_msa)
|
||||
int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_16x8_msa x264_template(pixel_sad_16x8_msa)
|
||||
int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_8x16_msa x264_template(pixel_sad_8x16_msa)
|
||||
int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_8x8_msa x264_template(pixel_sad_8x8_msa)
|
||||
int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_8x4_msa x264_template(pixel_sad_8x4_msa)
|
||||
int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_4x16_msa x264_template(pixel_sad_4x16_msa)
|
||||
int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_4x8_msa x264_template(pixel_sad_4x8_msa)
|
||||
int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_4x4_msa x264_template(pixel_sad_4x4_msa)
|
||||
int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_sad_x4_16x16_msa x264_template(pixel_sad_x4_16x16_msa)
|
||||
void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x4_16x8_msa x264_template(pixel_sad_x4_16x8_msa)
|
||||
void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x4_8x16_msa x264_template(pixel_sad_x4_8x16_msa)
|
||||
void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x4_8x8_msa x264_template(pixel_sad_x4_8x8_msa)
|
||||
void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x4_8x4_msa x264_template(pixel_sad_x4_8x4_msa)
|
||||
void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x4_4x8_msa x264_template(pixel_sad_x4_4x8_msa)
|
||||
void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x4_4x4_msa x264_template(pixel_sad_x4_4x4_msa)
|
||||
void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
uint8_t *p_ref3, intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[4] );
|
||||
#define x264_pixel_sad_x3_16x16_msa x264_template(pixel_sad_x3_16x16_msa)
|
||||
void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_sad_x3_16x8_msa x264_template(pixel_sad_x3_16x8_msa)
|
||||
void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_sad_x3_8x16_msa x264_template(pixel_sad_x3_8x16_msa)
|
||||
void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_sad_x3_8x8_msa x264_template(pixel_sad_x3_8x8_msa)
|
||||
void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_sad_x3_8x4_msa x264_template(pixel_sad_x3_8x4_msa)
|
||||
void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_sad_x3_4x8_msa x264_template(pixel_sad_x3_4x8_msa)
|
||||
void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_sad_x3_4x4_msa x264_template(pixel_sad_x3_4x4_msa)
|
||||
void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
|
||||
uint8_t *p_ref1, uint8_t *p_ref2,
|
||||
intptr_t i_ref_stride,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_ssd_16x16_msa x264_template(pixel_ssd_16x16_msa)
|
||||
int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_16x8_msa x264_template(pixel_ssd_16x8_msa)
|
||||
int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_8x16_msa x264_template(pixel_ssd_8x16_msa)
|
||||
int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_8x8_msa x264_template(pixel_ssd_8x8_msa)
|
||||
int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_8x4_msa x264_template(pixel_ssd_8x4_msa)
|
||||
int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_4x16_msa x264_template(pixel_ssd_4x16_msa)
|
||||
int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_4x8_msa x264_template(pixel_ssd_4x8_msa)
|
||||
int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_pixel_ssd_4x4_msa x264_template(pixel_ssd_4x4_msa)
|
||||
int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
|
||||
uint8_t *p_ref, intptr_t i_ref_stride );
|
||||
#define x264_intra_sad_x3_4x4_msa x264_template(intra_sad_x3_4x4_msa)
|
||||
void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_intra_sad_x3_16x16_msa x264_template(intra_sad_x3_16x16_msa)
|
||||
void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_intra_sad_x3_8x8_msa x264_template(intra_sad_x3_8x8_msa)
|
||||
void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_intra_sad_x3_8x8c_msa x264_template(intra_sad_x3_8x8c_msa)
|
||||
void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_ssim_4x4x2_core_msa x264_template(ssim_4x4x2_core_msa)
|
||||
void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
|
||||
const uint8_t *p_pix2, intptr_t i_stride2,
|
||||
int32_t i_sums[2][4] );
|
||||
#define x264_pixel_hadamard_ac_8x8_msa x264_template(pixel_hadamard_ac_8x8_msa)
|
||||
uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_hadamard_ac_8x16_msa x264_template(pixel_hadamard_ac_8x16_msa)
|
||||
uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_hadamard_ac_16x8_msa x264_template(pixel_hadamard_ac_16x8_msa)
|
||||
uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_hadamard_ac_16x16_msa x264_template(pixel_hadamard_ac_16x16_msa)
|
||||
uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_satd_4x4_msa x264_template(pixel_satd_4x4_msa)
|
||||
int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_4x8_msa x264_template(pixel_satd_4x8_msa)
|
||||
int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_4x16_msa x264_template(pixel_satd_4x16_msa)
|
||||
int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_8x4_msa x264_template(pixel_satd_8x4_msa)
|
||||
int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_8x8_msa x264_template(pixel_satd_8x8_msa)
|
||||
int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_8x16_msa x264_template(pixel_satd_8x16_msa)
|
||||
int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_16x8_msa x264_template(pixel_satd_16x8_msa)
|
||||
int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_satd_16x16_msa x264_template(pixel_satd_16x16_msa)
|
||||
int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_sa8d_8x8_msa x264_template(pixel_sa8d_8x8_msa)
|
||||
int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_pixel_sa8d_16x16_msa x264_template(pixel_sa8d_16x16_msa)
|
||||
int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
|
||||
uint8_t *p_pix2, intptr_t i_stride2 );
|
||||
#define x264_intra_satd_x3_4x4_msa x264_template(intra_satd_x3_4x4_msa)
|
||||
void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_intra_satd_x3_16x16_msa x264_template(intra_satd_x3_16x16_msa)
|
||||
void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_intra_sa8d_x3_8x8_msa x264_template(intra_sa8d_x3_8x8_msa)
|
||||
void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_intra_satd_x3_8x8c_msa x264_template(intra_satd_x3_8x8c_msa)
|
||||
void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
|
||||
int32_t p_sad_array[3] );
|
||||
#define x264_pixel_var_16x16_msa x264_template(pixel_var_16x16_msa)
|
||||
uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_var_8x16_msa x264_template(pixel_var_8x16_msa)
|
||||
uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_var_8x8_msa x264_template(pixel_var_8x8_msa)
|
||||
uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride );
|
||||
#define x264_pixel_var2_8x16_msa x264_template(pixel_var2_8x16_msa)
|
||||
int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
|
||||
uint8_t *p_pix2, intptr_t i_stride2,
|
||||
int32_t *p_ssd );
|
||||
#define x264_pixel_var2_8x8_msa x264_template(pixel_var2_8x8_msa)
|
||||
int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
|
||||
uint8_t *p_pix2, intptr_t i_stride2,
|
||||
int32_t *p_ssd );
|
||||
|
||||
#endif
|
||||
608
common/mips/predict-c.c
Normal file
608
common/mips/predict-c.c
Normal file
@@ -0,0 +1,608 @@
|
||||
/*****************************************************************************
|
||||
* predict-c.c: msa intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "macros.h"
|
||||
#include "predict.h"
|
||||
|
||||
#if !HIGH_BIT_DEPTH
|
||||
/* Vertical 4x4 prediction: the 32-bit word read from p_src is written,
 * unchanged, to four rows of p_dst via the SW4 helper, with i_dst_stride
 * as the row pitch. */
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint32_t u_top_row = LW( p_src );

    SW4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
}
|
||||
|
||||
/* Vertical 8x8 prediction: the 64-bit value read from p_src is written to
 * eight rows of p_dst, issued as two SD4 groups of four rows each. */
static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint64_t u_top_row = LD( p_src );
    uint8_t *p_lower_half = p_dst + ( 4 * i_dst_stride );

    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_dst, i_dst_stride );
    SD4( u_top_row, u_top_row, u_top_row, u_top_row, p_lower_half,
         i_dst_stride );
}
|
||||
|
||||
static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
v16u8 src0 = LD_UB( p_src );
|
||||
|
||||
ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
|
||||
i_dst_stride );
|
||||
p_dst += ( 8 * i_dst_stride );
|
||||
ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
|
||||
i_dst_stride );
|
||||
}
|
||||
|
||||
/* Horizontal 4x4 intra prediction.
 *
 * For each of the four rows, the byte at p_src[row * i_src_stride] is
 * replicated into all four bytes of a 32-bit word, and the four words are
 * handed to SW4 together with p_dst and i_dst_stride.
 *
 * The replication constant is deliberately unsigned.  A uint8_t operand is
 * promoted to int, so with a plain 0x01010101 (type int) the product for
 * any pixel value >= 128 exceeds INT_MAX on a 32-bit int, which is signed
 * overflow and therefore undefined behaviour.  With 0x01010101u the
 * multiplication is carried out in unsigned int and is always defined. */
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;

    u_out0 = p_src[0 * i_src_stride] * 0x01010101u;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101u;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101u;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101u;

    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}
|
||||
|
||||
/* Horizontal 8x8 prediction: for each of eight rows, the byte at
 * p_src[row * i_src_stride] is replicated into all eight bytes of a 64-bit
 * value; the eight values are then written to p_dst as two SD4 groups.
 * i_src_stride may be negative (the index is computed in signed int). */
static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_rows[8];
    uint8_t *p_lower_half = p_dst + ( 4 * i_dst_stride );

    for( int32_t i_row = 0; i_row < 8; i_row++ )
        u_rows[i_row] = p_src[i_row * i_src_stride] * 0x0101010101010101ull;

    SD4( u_rows[0], u_rows[1], u_rows[2], u_rows[3], p_dst, i_dst_stride );
    SD4( u_rows[4], u_rows[5], u_rows[6], u_rows[7], p_lower_half,
         i_dst_stride );
}
|
||||
|
||||
static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
|
||||
uint8_t *p_dst,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
uint32_t u_row;
|
||||
uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
|
||||
v16u8 src0, src1, src2, src3;
|
||||
|
||||
for( u_row = 4; u_row--; )
|
||||
{
|
||||
u_inp0 = p_src[0];
|
||||
p_src += i_src_stride;
|
||||
u_inp1 = p_src[0];
|
||||
p_src += i_src_stride;
|
||||
u_inp2 = p_src[0];
|
||||
p_src += i_src_stride;
|
||||
u_inp3 = p_src[0];
|
||||
p_src += i_src_stride;
|
||||
|
||||
src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
|
||||
src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
|
||||
src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
|
||||
src3 = ( v16u8 ) __msa_fill_b( u_inp3 );
|
||||
|
||||
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
|
||||
p_dst += ( 4 * i_dst_stride );
|
||||
}
|
||||
}
|
||||
|
||||
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
|
||||
int32_t i_src_stride_left,
|
||||
uint8_t *p_dst, int32_t i_dst_stride,
|
||||
uint8_t is_above, uint8_t is_left )
|
||||
{
|
||||
uint32_t u_row;
|
||||
uint32_t u_out, u_addition = 0;
|
||||
v16u8 src_above, store;
|
||||
v8u16 sum_above;
|
||||
v4u32 sum;
|
||||
|
||||
if( is_left && is_above )
|
||||
{
|
||||
src_above = LD_UB( p_src_top );
|
||||
|
||||
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||
sum = __msa_hadd_u_w( sum_above, sum_above );
|
||||
u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
|
||||
|
||||
for( u_row = 0; u_row < 4; u_row++ )
|
||||
{
|
||||
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||
}
|
||||
|
||||
u_addition = ( u_addition + 4 ) >> 3;
|
||||
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||
}
|
||||
else if( is_left )
|
||||
{
|
||||
for( u_row = 0; u_row < 4; u_row++ )
|
||||
{
|
||||
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||
}
|
||||
|
||||
u_addition = ( u_addition + 2 ) >> 2;
|
||||
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||
}
|
||||
else if( is_above )
|
||||
{
|
||||
src_above = LD_UB( p_src_top );
|
||||
|
||||
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||
sum = __msa_hadd_u_w( sum_above, sum_above );
|
||||
sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
|
||||
store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
|
||||
}
|
||||
else
|
||||
{
|
||||
store = ( v16u8 ) __msa_ldi_b( 128 );
|
||||
}
|
||||
|
||||
u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );
|
||||
|
||||
SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
|
||||
}
|
||||
|
||||
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
|
||||
uint8_t *p_dst, int32_t i_dst_stride )
|
||||
{
|
||||
uint64_t u_val0, u_val1;
|
||||
v16i8 store;
|
||||
v16u8 src = { 0 };
|
||||
v8u16 sum_h;
|
||||
v4u32 sum_w;
|
||||
v2u64 sum_d;
|
||||
|
||||
u_val0 = LD( p_src_top );
|
||||
u_val1 = LD( p_src_left );
|
||||
INSERT_D2_UB( u_val0, u_val1, src );
|
||||
sum_h = __msa_hadd_u_h( src, src );
|
||||
sum_w = __msa_hadd_u_w( sum_h, sum_h );
|
||||
sum_d = __msa_hadd_u_d( sum_w, sum_w );
|
||||
sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
|
||||
sum_d = __msa_hadd_u_d( sum_w, sum_w );
|
||||
sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
|
||||
store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
|
||||
u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );
|
||||
|
||||
SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
|
||||
p_dst += ( 4 * i_dst_stride );
|
||||
SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
|
||||
}
|
||||
|
||||
static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
|
||||
int32_t i_src_stride_left,
|
||||
uint8_t *p_dst, int32_t i_dst_stride,
|
||||
uint8_t is_above, uint8_t is_left )
|
||||
{
|
||||
uint32_t u_row;
|
||||
uint32_t u_addition = 0;
|
||||
v16u8 src_above, store;
|
||||
v8u16 sum_above;
|
||||
v4u32 sum_top;
|
||||
v2u64 sum;
|
||||
|
||||
if( is_left && is_above )
|
||||
{
|
||||
src_above = LD_UB( p_src_top );
|
||||
|
||||
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||
sum_top = __msa_hadd_u_w( sum_above, sum_above );
|
||||
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||
sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
|
||||
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||
u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );
|
||||
|
||||
for( u_row = 0; u_row < 16; u_row++ )
|
||||
{
|
||||
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||
}
|
||||
|
||||
u_addition = ( u_addition + 16 ) >> 5;
|
||||
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||
}
|
||||
else if( is_left )
|
||||
{
|
||||
for( u_row = 0; u_row < 16; u_row++ )
|
||||
{
|
||||
u_addition += p_src_left[u_row * i_src_stride_left];
|
||||
}
|
||||
|
||||
u_addition = ( u_addition + 8 ) >> 4;
|
||||
store = ( v16u8 ) __msa_fill_b( u_addition );
|
||||
}
|
||||
else if( is_above )
|
||||
{
|
||||
src_above = LD_UB( p_src_top );
|
||||
|
||||
sum_above = __msa_hadd_u_h( src_above, src_above );
|
||||
sum_top = __msa_hadd_u_w( sum_above, sum_above );
|
||||
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||
sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
|
||||
sum = __msa_hadd_u_d( sum_top, sum_top );
|
||||
sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
|
||||
store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
|
||||
}
|
||||
else
|
||||
{
|
||||
store = ( v16u8 ) __msa_ldi_b( 128 );
|
||||
}
|
||||
|
||||
ST_UB8( store, store, store, store, store, store, store, store, p_dst,
|
||||
i_dst_stride );
|
||||
p_dst += ( 8 * i_dst_stride );
|
||||
ST_UB8( store, store, store, store, store, store, store, store, p_dst,
|
||||
i_dst_stride );
|
||||
}
|
||||
|
||||
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
|
||||
{
|
||||
uint8_t u_lpcnt;
|
||||
int32_t i_res, i_res0, i_res1, i_res2, i_res3;
|
||||
uint64_t u_out0, u_out1;
|
||||
v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
|
||||
v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
|
||||
v4i32 int_multiplier = { 0, 1, 2, 3 };
|
||||
v16u8 p_src_top;
|
||||
v8i16 vec9, vec10, vec11;
|
||||
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
|
||||
v2i64 sum;
|
||||
|
||||
p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
|
||||
p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
|
||||
( v16i8 ) p_src_top );
|
||||
|
||||
vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
|
||||
vec9 *= short_multiplier;
|
||||
vec8 = __msa_hadd_s_w( vec9, vec9 );
|
||||
sum = __msa_hadd_s_d( vec8, vec8 );
|
||||
|
||||
i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );
|
||||
|
||||
i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
|
||||
2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
|
||||
3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
|
||||
4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );
|
||||
|
||||
i_res0 *= 17;
|
||||
i_res1 *= 17;
|
||||
i_res0 = ( i_res0 + 16 ) >> 5;
|
||||
i_res1 = ( i_res1 + 16 ) >> 5;
|
||||
|
||||
i_res3 = 3 * ( i_res0 + i_res1 );
|
||||
i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
|
||||
i_res = i_res2 - i_res3;
|
||||
|
||||
vec8 = __msa_fill_w( i_res0 );
|
||||
vec4 = __msa_fill_w( i_res );
|
||||
vec2 = __msa_fill_w( i_res1 );
|
||||
vec5 = vec8 * int_multiplier;
|
||||
vec3 = vec8 * 4;
|
||||
|
||||
for( u_lpcnt = 4; u_lpcnt--; )
|
||||
{
|
||||
vec0 = vec5;
|
||||
vec0 += vec4;
|
||||
vec1 = vec0 + vec3;
|
||||
vec6 = vec5;
|
||||
vec4 += vec2;
|
||||
vec6 += vec4;
|
||||
vec7 = vec6 + vec3;
|
||||
|
||||
SRA_4V( vec0, vec1, vec6, vec7, 5 );
|
||||
PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
|
||||
CLIP_SH2_0_255( vec10, vec11 );
|
||||
PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );
|
||||
|
||||
u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
|
||||
u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
|
||||
SD( u_out0, p_src );
|
||||
p_src += i_stride;
|
||||
SD( u_out1, p_src );
|
||||
p_src += i_stride;
|
||||
|
||||
vec4 += vec2;
|
||||
}
|
||||
}
|
||||
|
||||
static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
|
||||
{
|
||||
uint8_t u_lpcnt;
|
||||
int32_t i_res0, i_res1, i_res2, i_res3;
|
||||
uint64_t u_load0, u_load1;
|
||||
v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
|
||||
v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
|
||||
v4i32 int_multiplier = { 0, 1, 2, 3 };
|
||||
v16u8 p_src_top = { 0 };
|
||||
v8i16 vec9, vec10;
|
||||
v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
|
||||
|
||||
u_load0 = LD( p_src - ( i_stride + 1 ) );
|
||||
u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );
|
||||
|
||||
INSERT_D2_UB( u_load0, u_load1, p_src_top );
|
||||
|
||||
p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
|
||||
( v16i8 ) p_src_top );
|
||||
|
||||
vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
|
||||
vec9 *= short_multiplier;
|
||||
vec8 = __msa_hadd_s_w( vec9, vec9 );
|
||||
res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );
|
||||
|
||||
i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );
|
||||
|
||||
i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
|
||||
2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
|
||||
3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
|
||||
4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
|
||||
5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
|
||||
6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
|
||||
7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
|
||||
8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );
|
||||
|
||||
i_res0 *= 5;
|
||||
i_res1 *= 5;
|
||||
i_res0 = ( i_res0 + 32 ) >> 6;
|
||||
i_res1 = ( i_res1 + 32 ) >> 6;
|
||||
|
||||
i_res3 = 7 * ( i_res0 + i_res1 );
|
||||
i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
|
||||
i_res2 -= i_res3;
|
||||
|
||||
vec8 = __msa_fill_w( i_res0 );
|
||||
vec4 = __msa_fill_w( i_res2 );
|
||||
vec5 = __msa_fill_w( i_res1 );
|
||||
vec6 = vec8 * 4;
|
||||
vec7 = vec8 * int_multiplier;
|
||||
|
||||
for( u_lpcnt = 16; u_lpcnt--; )
|
||||
{
|
||||
vec0 = vec7;
|
||||
vec0 += vec4;
|
||||
vec1 = vec0 + vec6;
|
||||
vec2 = vec1 + vec6;
|
||||
vec3 = vec2 + vec6;
|
||||
|
||||
SRA_4V( vec0, vec1, vec2, vec3, 5 );
|
||||
PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
|
||||
CLIP_SH2_0_255( vec9, vec10 );
|
||||
PCKEV_ST_SB( vec9, vec10, p_src );
|
||||
p_src += i_stride;
|
||||
|
||||
vec4 += vec5;
|
||||
}
|
||||
}
|
||||
|
||||
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
|
||||
{
|
||||
uint8_t u_lp_cnt;
|
||||
uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
|
||||
uint32_t u_out0, u_out1, u_out2, u_out3;
|
||||
v16u8 p_src_top;
|
||||
v8u16 add;
|
||||
v4u32 sum;
|
||||
|
||||
p_src_top = LD_UB( p_src - i_stride );
|
||||
add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
|
||||
sum = __msa_hadd_u_w( add, add );
|
||||
u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
|
||||
u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );
|
||||
|
||||
for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
|
||||
{
|
||||
u_src0 += p_src[u_lp_cnt * i_stride - 1];
|
||||
u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
|
||||
}
|
||||
|
||||
u_src0 = ( u_src0 + 4 ) >> 3;
|
||||
u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
|
||||
u_src1 = ( u_src1 + 2 ) >> 2;
|
||||
u_src2 = ( u_src2 + 2 ) >> 2;
|
||||
|
||||
u_out0 = u_src0 * 0x01010101;
|
||||
u_out1 = u_src1 * 0x01010101;
|
||||
u_out2 = u_src2 * 0x01010101;
|
||||
u_out3 = u_src3 * 0x01010101;
|
||||
|
||||
for( u_lp_cnt = 4; u_lp_cnt--; )
|
||||
{
|
||||
SW( u_out0, p_src );
|
||||
SW( u_out1, ( p_src + 4 ) );
|
||||
SW( u_out2, ( p_src + 4 * i_stride ) );
|
||||
SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
|
||||
p_src += i_stride;
|
||||
}
|
||||
}
|
||||
|
||||
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
uint8_t u_src_val = p_src[15];
|
||||
uint64_t u_out0, u_out1, u_out2, u_out3;
|
||||
v16u8 src, vec4, vec5, res0;
|
||||
v8u16 vec0, vec1, vec2, vec3;
|
||||
v2i64 res1, res2, res3;
|
||||
|
||||
src = LD_UB( p_src );
|
||||
|
||||
vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
|
||||
vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
|
||||
vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
|
||||
ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
|
||||
ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
|
||||
HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );
|
||||
|
||||
vec0 += vec1;
|
||||
vec2 += vec3;
|
||||
vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
|
||||
vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );
|
||||
|
||||
res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
|
||||
res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
|
||||
res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
|
||||
res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
|
||||
|
||||
u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
|
||||
u_out1 = __msa_copy_u_d( res1, 0 );
|
||||
u_out2 = __msa_copy_u_d( res2, 0 );
|
||||
u_out3 = __msa_copy_u_d( res3, 0 );
|
||||
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
|
||||
p_dst += ( 4 * i_dst_stride );
|
||||
|
||||
res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
|
||||
res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
|
||||
res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
|
||||
res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );
|
||||
|
||||
u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
|
||||
u_out1 = __msa_copy_u_d( res1, 0 );
|
||||
u_out2 = __msa_copy_u_d( res2, 0 );
|
||||
u_out3 = __msa_copy_u_d( res3, 0 );
|
||||
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
|
||||
}
|
||||
|
||||
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
|
||||
int32_t i_dst_stride )
|
||||
{
|
||||
v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );
|
||||
|
||||
ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
|
||||
p_dst += ( 8 * i_dst_stride );
|
||||
ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||
FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||
FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||
FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
|
||||
p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
|
||||
p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||
{
|
||||
intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||
{
|
||||
intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
|
||||
p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||
{
|
||||
intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
|
||||
{
|
||||
intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
|
||||
FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
|
||||
}
|
||||
|
||||
void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
|
||||
p_src, FDEC_STRIDE );
|
||||
}
|
||||
|
||||
void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
|
||||
{
|
||||
intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
|
||||
}
|
||||
#endif
|
||||
66
common/mips/predict.h
Normal file
66
common/mips/predict.h
Normal file
@@ -0,0 +1,66 @@
|
||||
/*****************************************************************************
|
||||
* predict.h: msa intra prediction
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_MIPS_PREDICT_H
|
||||
#define X264_MIPS_PREDICT_H
|
||||
|
||||
#define x264_intra_predict_dc_16x16_msa x264_template(intra_predict_dc_16x16_msa)
|
||||
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_dc_left_16x16_msa x264_template(intra_predict_dc_left_16x16_msa)
|
||||
void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_dc_top_16x16_msa x264_template(intra_predict_dc_top_16x16_msa)
|
||||
void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_dc_128_16x16_msa x264_template(intra_predict_dc_128_16x16_msa)
|
||||
void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_hor_16x16_msa x264_template(intra_predict_hor_16x16_msa)
|
||||
void x264_intra_predict_hor_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_vert_16x16_msa x264_template(intra_predict_vert_16x16_msa)
|
||||
void x264_intra_predict_vert_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_plane_16x16_msa x264_template(intra_predict_plane_16x16_msa)
|
||||
void x264_intra_predict_plane_16x16_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_dc_4blk_8x8_msa x264_template(intra_predict_dc_4blk_8x8_msa)
|
||||
void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_hor_8x8_msa x264_template(intra_predict_hor_8x8_msa)
|
||||
void x264_intra_predict_hor_8x8_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_vert_8x8_msa x264_template(intra_predict_vert_8x8_msa)
|
||||
void x264_intra_predict_vert_8x8_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_plane_8x8_msa x264_template(intra_predict_plane_8x8_msa)
|
||||
void x264_intra_predict_plane_8x8_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_ddl_8x8_msa x264_template(intra_predict_ddl_8x8_msa)
|
||||
void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
|
||||
#define x264_intra_predict_dc_8x8_msa x264_template(intra_predict_dc_8x8_msa)
|
||||
void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
|
||||
#define x264_intra_predict_h_8x8_msa x264_template(intra_predict_h_8x8_msa)
|
||||
void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
|
||||
#define x264_intra_predict_v_8x8_msa x264_template(intra_predict_v_8x8_msa)
|
||||
void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] );
|
||||
#define x264_intra_predict_dc_4x4_msa x264_template(intra_predict_dc_4x4_msa)
|
||||
void x264_intra_predict_dc_4x4_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_hor_4x4_msa x264_template(intra_predict_hor_4x4_msa)
|
||||
void x264_intra_predict_hor_4x4_msa( uint8_t *p_src );
|
||||
#define x264_intra_predict_vert_4x4_msa x264_template(intra_predict_vert_4x4_msa)
|
||||
void x264_intra_predict_vert_4x4_msa( uint8_t *p_src );
|
||||
|
||||
#endif
|
||||
631
common/mips/quant-c.c
Normal file
631
common/mips/quant-c.c
Normal file
@@ -0,0 +1,631 @@
|
||||
/*****************************************************************************
|
||||
* quant-c.c: msa quantization and level-run
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common/common.h"
|
||||
#include "macros.h"
|
||||
#include "quant.h"
|
||||
|
||||
#if !HIGH_BIT_DEPTH
|
||||
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
|
||||
int32_t i_qp )
|
||||
{
|
||||
const int32_t i_mf = i_qp % 6;
|
||||
const int32_t q_bits = i_qp / 6 - 4;
|
||||
v8i16 dct0, dct1;
|
||||
v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
|
||||
|
||||
LD_SH2( p_dct, 8, dct0, dct1 );
|
||||
|
||||
LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
|
||||
|
||||
if( q_bits >= 0 )
|
||||
{
|
||||
v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;
|
||||
|
||||
q_bits_vec = __msa_fill_h( q_bits );
|
||||
|
||||
PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
|
||||
dequant_mf_h0, dequant_mf_h1 );
|
||||
|
||||
dct0 *= dequant_mf_h0;
|
||||
dct1 *= dequant_mf_h1;
|
||||
dct0 <<= q_bits_vec;
|
||||
dct1 <<= q_bits_vec;
|
||||
ST_SH2( dct0, dct1, p_dct, 8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
const int32_t q_bits_add = 1 << ( -q_bits - 1 );
|
||||
v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
|
||||
v4i32 q_bits_vec, q_bits_vec_add;
|
||||
|
||||
q_bits_vec_add = __msa_fill_w( q_bits_add );
|
||||
q_bits_vec = __msa_fill_w( -q_bits );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
|
||||
dct_signed_w0 *= dequant_m_f0;
|
||||
dct_signed_w1 *= dequant_m_f1;
|
||||
dct_signed_w2 *= dequant_m_f2;
|
||||
dct_signed_w3 *= dequant_m_f3;
|
||||
dct_signed_w0 += q_bits_vec_add;
|
||||
dct_signed_w1 += q_bits_vec_add;
|
||||
dct_signed_w2 += q_bits_vec_add;
|
||||
dct_signed_w3 += q_bits_vec_add;
|
||||
|
||||
SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
|
||||
q_bits_vec );
|
||||
PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
|
||||
dct0, dct1 );
|
||||
ST_SH2( dct0, dct1, p_dct, 8 );
|
||||
}
|
||||
}
|
||||
|
||||
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
|
||||
int32_t i_qp )
|
||||
{
|
||||
const int32_t i_mf = i_qp % 6;
|
||||
const int32_t q_bits = i_qp / 6 - 6;
|
||||
v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
|
||||
v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
|
||||
v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
|
||||
v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
|
||||
v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;
|
||||
|
||||
LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );
|
||||
|
||||
LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
|
||||
LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );
|
||||
|
||||
if( q_bits >= 0 )
|
||||
{
|
||||
v8i16 q_bits_vec;
|
||||
v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
|
||||
v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;
|
||||
|
||||
q_bits_vec = __msa_fill_h( q_bits );
|
||||
|
||||
PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
|
||||
dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
|
||||
dequant_mf_h0, dequant_mf_h1,
|
||||
dequant_mf_h2, dequant_mf_h3 );
|
||||
PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
|
||||
dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
|
||||
dequant_mf_h4, dequant_mf_h5,
|
||||
dequant_mf_h6, dequant_mf_h7 );
|
||||
|
||||
dct0 *= dequant_mf_h0;
|
||||
dct1 *= dequant_mf_h1;
|
||||
dct2 *= dequant_mf_h2;
|
||||
dct3 *= dequant_mf_h3;
|
||||
dct4 *= dequant_mf_h4;
|
||||
dct5 *= dequant_mf_h5;
|
||||
dct6 *= dequant_mf_h6;
|
||||
dct7 *= dequant_mf_h7;
|
||||
|
||||
SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
|
||||
SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );
|
||||
|
||||
ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
const int32_t q_bits_add = 1 << ( -q_bits - 1 );
|
||||
v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
|
||||
v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
|
||||
v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
|
||||
v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
|
||||
v4i32 q_bits_vec, q_bits_vec_add;
|
||||
|
||||
q_bits_vec_add = __msa_fill_w( q_bits_add );
|
||||
q_bits_vec = __msa_fill_w( -q_bits );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
|
||||
UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
|
||||
UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
|
||||
UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
|
||||
UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
|
||||
UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );
|
||||
|
||||
dct_signed_w0 *= dequant_m_f0;
|
||||
dct_signed_w1 *= dequant_m_f1;
|
||||
dct_signed_w2 *= dequant_m_f2;
|
||||
dct_signed_w3 *= dequant_m_f3;
|
||||
dct_signed_w4 *= dequant_m_f4;
|
||||
dct_signed_w5 *= dequant_m_f5;
|
||||
dct_signed_w6 *= dequant_m_f6;
|
||||
dct_signed_w7 *= dequant_m_f7;
|
||||
dct_signed_w8 *= dequant_m_f8;
|
||||
dct_signed_w9 *= dequant_m_f9;
|
||||
dct_signed_w10 *= dequant_m_f10;
|
||||
dct_signed_w11 *= dequant_m_f11;
|
||||
dct_signed_w12 *= dequant_m_f12;
|
||||
dct_signed_w13 *= dequant_m_f13;
|
||||
dct_signed_w14 *= dequant_m_f14;
|
||||
dct_signed_w15 *= dequant_m_f15;
|
||||
|
||||
dct_signed_w0 += q_bits_vec_add;
|
||||
dct_signed_w1 += q_bits_vec_add;
|
||||
dct_signed_w2 += q_bits_vec_add;
|
||||
dct_signed_w3 += q_bits_vec_add;
|
||||
dct_signed_w4 += q_bits_vec_add;
|
||||
dct_signed_w5 += q_bits_vec_add;
|
||||
dct_signed_w6 += q_bits_vec_add;
|
||||
dct_signed_w7 += q_bits_vec_add;
|
||||
dct_signed_w8 += q_bits_vec_add;
|
||||
dct_signed_w9 += q_bits_vec_add;
|
||||
dct_signed_w10 += q_bits_vec_add;
|
||||
dct_signed_w11 += q_bits_vec_add;
|
||||
dct_signed_w12 += q_bits_vec_add;
|
||||
dct_signed_w13 += q_bits_vec_add;
|
||||
dct_signed_w14 += q_bits_vec_add;
|
||||
dct_signed_w15 += q_bits_vec_add;
|
||||
|
||||
SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
|
||||
q_bits_vec );
|
||||
SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
|
||||
q_bits_vec );
|
||||
SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
|
||||
q_bits_vec );
|
||||
SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
|
||||
q_bits_vec );
|
||||
PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
|
||||
dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
|
||||
dct0, dct1, dct2, dct3 );
|
||||
PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
|
||||
dct_signed_w10, dct_signed_w13, dct_signed_w12,
|
||||
dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
|
||||
ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
|
||||
}
|
||||
}
|
||||
|
||||
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
|
||||
int32_t pi_dequant_mf[6][16],
|
||||
int32_t i_qp )
|
||||
{
|
||||
const int32_t q_bits = i_qp / 6 - 6;
|
||||
int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
|
||||
v8i16 dct0, dct1, dequant_mf_h;
|
||||
|
||||
LD_SH2( p_dct, 8, dct0, dct1 );
|
||||
|
||||
if( q_bits >= 0 )
|
||||
{
|
||||
i_dmf <<= q_bits;
|
||||
|
||||
dequant_mf_h = __msa_fill_h( i_dmf );
|
||||
dct0 = dct0 * dequant_mf_h;
|
||||
dct1 = dct1 * dequant_mf_h;
|
||||
|
||||
ST_SH2( dct0, dct1, p_dct, 8 );
|
||||
}
|
||||
else
|
||||
{
|
||||
const int32_t q_bits_add = 1 << ( -q_bits - 1 );
|
||||
v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
|
||||
v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
|
||||
|
||||
q_bits_vec_add = __msa_fill_w( q_bits_add );
|
||||
q_bits_vec = __msa_fill_w( -q_bits );
|
||||
|
||||
dequant_m_f = __msa_fill_w( i_dmf );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
|
||||
dct_signed_w0 *= dequant_m_f;
|
||||
dct_signed_w1 *= dequant_m_f;
|
||||
dct_signed_w2 *= dequant_m_f;
|
||||
dct_signed_w3 *= dequant_m_f;
|
||||
|
||||
dct_signed_w0 += q_bits_vec_add;
|
||||
dct_signed_w1 += q_bits_vec_add;
|
||||
dct_signed_w2 += q_bits_vec_add;
|
||||
dct_signed_w3 += q_bits_vec_add;
|
||||
|
||||
SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
|
||||
q_bits_vec );
|
||||
PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
|
||||
dct0, dct1 );
|
||||
ST_SH2( dct0, dct1, p_dct, 8 );
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
|
||||
uint16_t *p_bias )
|
||||
{
|
||||
int32_t non_zero = 0;
|
||||
v8i16 dct0, dct1;
|
||||
v8i16 zero = { 0 };
|
||||
v8i16 dct0_mask, dct1_mask;
|
||||
v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
|
||||
v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
|
||||
v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
|
||||
v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
|
||||
v4i32 bias0, bias1, bias2, bias3;
|
||||
|
||||
LD_SH2( p_dct, 8, dct0, dct1 );
|
||||
LD_SH2( p_bias, 8, bias_h0, bias_h1 );
|
||||
LD_SH2( p_mf, 8, mf_h0, mf_h1 );
|
||||
|
||||
dct0_mask = __msa_clei_s_h( dct0, 0 );
|
||||
dct1_mask = __msa_clei_s_h( dct1, 0 );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
|
||||
ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
|
||||
ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
|
||||
ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );
|
||||
|
||||
dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
|
||||
dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
|
||||
dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
|
||||
dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
|
||||
|
||||
dct_w0 *= mf_vec0;
|
||||
dct_w1 *= mf_vec1;
|
||||
dct_w2 *= mf_vec2;
|
||||
dct_w3 *= mf_vec3;
|
||||
|
||||
SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
|
||||
PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
|
||||
|
||||
dct0 = zero - dct_h0;
|
||||
dct1 = zero - dct_h1;
|
||||
|
||||
dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
|
||||
( v16u8 ) dct0_mask );
|
||||
dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
|
||||
( v16u8 ) dct1_mask );
|
||||
non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
|
||||
ST_SH2( dct0, dct1, p_dct, 8 );
|
||||
|
||||
return !!non_zero;
|
||||
}
|
||||
|
||||
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
|
||||
uint16_t *p_bias )
|
||||
{
|
||||
int32_t non_zero = 0;
|
||||
v8i16 dct0, dct1, dct2, dct3;
|
||||
v8i16 zero = { 0 };
|
||||
v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
|
||||
v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
|
||||
v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
|
||||
v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
|
||||
v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
|
||||
v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
|
||||
v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
|
||||
v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
|
||||
v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;
|
||||
|
||||
LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );
|
||||
|
||||
dct0_mask = __msa_clei_s_h( dct0, 0 );
|
||||
dct1_mask = __msa_clei_s_h( dct1, 0 );
|
||||
dct2_mask = __msa_clei_s_h( dct2, 0 );
|
||||
dct3_mask = __msa_clei_s_h( dct3, 0 );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
|
||||
UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
|
||||
LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
|
||||
ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
|
||||
bias0, bias2, bias4, bias6 );
|
||||
ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
|
||||
bias1, bias3, bias5, bias7 );
|
||||
LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
|
||||
ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
|
||||
mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
|
||||
ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
|
||||
mf_vec1, mf_vec3, mf_vec5, mf_vec7 );
|
||||
|
||||
dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
|
||||
dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
|
||||
dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
|
||||
dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
|
||||
dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
|
||||
dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
|
||||
dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
|
||||
dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );
|
||||
|
||||
dct_w0 *= mf_vec0;
|
||||
dct_w1 *= mf_vec1;
|
||||
dct_w2 *= mf_vec2;
|
||||
dct_w3 *= mf_vec3;
|
||||
dct_w4 *= mf_vec4;
|
||||
dct_w5 *= mf_vec5;
|
||||
dct_w6 *= mf_vec6;
|
||||
dct_w7 *= mf_vec7;
|
||||
|
||||
SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
|
||||
SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
|
||||
PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
|
||||
dct_h0, dct_h1, dct_h2, dct_h3 );
|
||||
SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
|
||||
dct0, dct1, dct2, dct3 );
|
||||
|
||||
dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
|
||||
( v16u8 ) dct0, ( v16u8 ) dct0_mask );
|
||||
dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
|
||||
( v16u8 ) dct1, ( v16u8 ) dct1_mask );
|
||||
dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
|
||||
( v16u8 ) dct2, ( v16u8 ) dct2_mask );
|
||||
dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
|
||||
( v16u8 ) dct3, ( v16u8 ) dct3_mask );
|
||||
|
||||
non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
|
||||
ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
|
||||
LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );
|
||||
|
||||
dct0_mask = __msa_clei_s_h( dct0, 0 );
|
||||
dct1_mask = __msa_clei_s_h( dct1, 0 );
|
||||
dct2_mask = __msa_clei_s_h( dct2, 0 );
|
||||
dct3_mask = __msa_clei_s_h( dct3, 0 );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
|
||||
UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
|
||||
LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
|
||||
ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
|
||||
bias0, bias2, bias4, bias6 );
|
||||
ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
|
||||
bias1, bias3, bias5, bias7 );
|
||||
LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
|
||||
ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
|
||||
mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
|
||||
ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
|
||||
mf_vec1, mf_vec3, mf_vec5, mf_vec7 );
|
||||
|
||||
dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
|
||||
dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
|
||||
dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
|
||||
dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
|
||||
dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
|
||||
dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
|
||||
dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
|
||||
dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );
|
||||
|
||||
dct_w0 *= mf_vec0;
|
||||
dct_w1 *= mf_vec1;
|
||||
dct_w2 *= mf_vec2;
|
||||
dct_w3 *= mf_vec3;
|
||||
dct_w4 *= mf_vec4;
|
||||
dct_w5 *= mf_vec5;
|
||||
dct_w6 *= mf_vec6;
|
||||
dct_w7 *= mf_vec7;
|
||||
|
||||
SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
|
||||
SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
|
||||
PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
|
||||
PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
|
||||
SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
|
||||
dct0, dct1, dct2, dct3 );
|
||||
|
||||
dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
|
||||
( v16u8 ) dct0, ( v16u8 ) dct0_mask );
|
||||
dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
|
||||
( v16u8 ) dct1, ( v16u8 ) dct1_mask );
|
||||
dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
|
||||
( v16u8 ) dct2, ( v16u8 ) dct2_mask );
|
||||
dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
|
||||
( v16u8 ) dct3, ( v16u8 ) dct3_mask );
|
||||
|
||||
non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
|
||||
ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );
|
||||
|
||||
return !!non_zero;
|
||||
}
|
||||
|
||||
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
|
||||
int32_t i_bias )
|
||||
{
|
||||
int32_t non_zero = 0;
|
||||
v8i16 dct0, dct1, dct0_mask, dct1_mask;
|
||||
v8i16 zero = { 0 };
|
||||
v8i16 dct_h0, dct_h1;
|
||||
v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
|
||||
v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
|
||||
v4i32 mf_vec, bias_vec;
|
||||
|
||||
LD_SH2( p_dct, 8, dct0, dct1 );
|
||||
|
||||
dct0_mask = __msa_clei_s_h( dct0, 0 );
|
||||
dct1_mask = __msa_clei_s_h( dct1, 0 );
|
||||
|
||||
UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
|
||||
UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
|
||||
|
||||
bias_vec = __msa_fill_w( i_bias );
|
||||
mf_vec = __msa_fill_w( i_mf );
|
||||
|
||||
dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
|
||||
dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
|
||||
dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
|
||||
dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );
|
||||
|
||||
dct_w0 *= mf_vec;
|
||||
dct_w1 *= mf_vec;
|
||||
dct_w2 *= mf_vec;
|
||||
dct_w3 *= mf_vec;
|
||||
|
||||
SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
|
||||
PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
|
||||
|
||||
dct0 = zero - dct_h0;
|
||||
dct1 = zero - dct_h1;
|
||||
dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
|
||||
( v16u8 ) dct0, ( v16u8 ) dct0_mask );
|
||||
dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
|
||||
( v16u8 ) dct1, ( v16u8 ) dct1_mask );
|
||||
non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
|
||||
|
||||
ST_SH2( dct0, dct1, p_dct, 8 );
|
||||
|
||||
return !!non_zero;
|
||||
}
|
||||
|
||||
static int32_t avc_coeff_last64_msa( int16_t *p_src )
|
||||
{
|
||||
uint32_t u_res;
|
||||
v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
|
||||
v16u8 tmp0, tmp1, tmp2, tmp3;
|
||||
v8u16 vec0, vec1, vec2, vec3;
|
||||
v4i32 out0;
|
||||
v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
|
||||
|
||||
LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
|
||||
|
||||
tmp_h0 = __msa_ceqi_h( src0, 0 );
|
||||
tmp_h1 = __msa_ceqi_h( src1, 0 );
|
||||
tmp_h2 = __msa_ceqi_h( src2, 0 );
|
||||
tmp_h3 = __msa_ceqi_h( src3, 0 );
|
||||
tmp_h4 = __msa_ceqi_h( src4, 0 );
|
||||
tmp_h5 = __msa_ceqi_h( src5, 0 );
|
||||
tmp_h6 = __msa_ceqi_h( src6, 0 );
|
||||
tmp_h7 = __msa_ceqi_h( src7, 0 );
|
||||
|
||||
PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
|
||||
tmp0, tmp1, tmp2, tmp3 );
|
||||
|
||||
tmp0 = tmp0 & mask;
|
||||
tmp1 = tmp1 & mask;
|
||||
tmp2 = tmp2 & mask;
|
||||
tmp3 = tmp3 & mask;
|
||||
|
||||
HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
|
||||
PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
|
||||
HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );
|
||||
|
||||
tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
|
||||
vec0 = __msa_hadd_u_h( tmp0, tmp0 );
|
||||
tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
|
||||
out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
|
||||
u_res = __msa_copy_u_w( out0, 0 );
|
||||
|
||||
return ( 63 - u_res );
|
||||
}
|
||||
|
||||
static int32_t avc_coeff_last16_msa( int16_t *p_src )
|
||||
{
|
||||
uint32_t u_res;
|
||||
v8i16 src0, src1;
|
||||
v8u16 tmp_h0;
|
||||
v16u8 tmp0;
|
||||
v8i16 out0, out1;
|
||||
v16i8 res0;
|
||||
v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };
|
||||
|
||||
LD_SH2( p_src, 8, src0, src1 );
|
||||
|
||||
out0 = __msa_ceqi_h( src0, 0 );
|
||||
out1 = __msa_ceqi_h( src1, 0 );
|
||||
|
||||
tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
|
||||
tmp0 = tmp0 & mask;
|
||||
tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
|
||||
tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
|
||||
tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
|
||||
tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
|
||||
tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
|
||||
res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
|
||||
out0 = __msa_nloc_h( ( v8i16 ) res0 );
|
||||
u_res = __msa_copy_u_h( out0, 0 );
|
||||
|
||||
return ( 15 - u_res );
|
||||
}
|
||||
|
||||
/* Public entry point: in-place dequantisation of a 4x4 block.  Forwards all
 * arguments unchanged to the file-local MSA implementation. */
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                           int32_t i_qp )
{
    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
}
|
||||
|
||||
/* Public entry point: in-place dequantisation of an 8x8 block.  Forwards all
 * arguments unchanged to the file-local MSA implementation. */
void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                           int32_t i_qp )
{
    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
}
|
||||
|
||||
/* Public entry point: in-place dequantisation of a 4x4 DC block.  Forwards
 * all arguments unchanged to the file-local MSA implementation. */
void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                              int32_t i_qp )
{
    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
}
|
||||
|
||||
/* Public entry point: in-place quantisation of a 4x4 block.  Returns the
 * 0/1 result of the file-local MSA implementation. */
int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    const int32_t i_nz = avc_quant_4x4_msa( p_dct, p_mf, p_bias );

    return i_nz;
}
|
||||
|
||||
/* Quantise four consecutive 4x4 blocks in place with a shared multiplier
 * and bias table.
 *
 * Returns a 4-bit mask: bit j is set when x264_quant_4x4_msa() reported a
 * non-zero result for p_dct[j]. */
int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
{
    int32_t i_nz_mask = 0;

    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        if( x264_quant_4x4_msa( p_dct[i_blk], pu_mf, pu_bias ) )
            i_nz_mask |= 1 << i_blk;
    }

    return i_nz_mask;
}
|
||||
|
||||
/* Public entry point: in-place quantisation of an 8x8 block.  Returns the
 * 0/1 result of the file-local MSA implementation. */
int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    const int32_t i_nz = avc_quant_8x8_msa( p_dct, p_mf, p_bias );

    return i_nz;
}
|
||||
|
||||
/* Public entry point: in-place quantisation of a 4x4 DC block with scalar
 * multiplier and bias.  Returns the 0/1 result of the file-local MSA
 * implementation. */
int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
{
    const int32_t i_nz = avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );

    return i_nz;
}
|
||||
|
||||
/* Public entry point for the 64-coefficient scan; returns whatever the
 * file-local MSA implementation computes for p_src. */
int32_t x264_coeff_last64_msa( int16_t *p_src )
{
    const int32_t i_last = avc_coeff_last64_msa( p_src );

    return i_last;
}
|
||||
|
||||
/* Public entry point for the 16-coefficient scan; returns whatever the
 * file-local MSA implementation computes for p_src. */
int32_t x264_coeff_last16_msa( int16_t *p_src )
{
    const int32_t i_last = avc_coeff_last16_msa( p_src );

    return i_last;
}
|
||||
#endif
|
||||
52
common/mips/quant.h
Normal file
52
common/mips/quant.h
Normal file
@@ -0,0 +1,52 @@
|
||||
/*****************************************************************************
|
||||
* quant.h: msa quantization and level-run
|
||||
*****************************************************************************
|
||||
* Copyright (C) 2015-2025 x264 project
|
||||
*
|
||||
* Authors: Rishikesh More <rishikesh.more@imgtec.com>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
|
||||
*
|
||||
* This program is also available under a commercial proprietary license.
|
||||
* For more information, contact us at licensing@x264.com.
|
||||
*****************************************************************************/
|
||||
|
||||
#ifndef X264_MIPS_QUANT_H
|
||||
#define X264_MIPS_QUANT_H
|
||||
|
||||
#define x264_dequant_4x4_msa x264_template(dequant_4x4_msa)
|
||||
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
|
||||
int32_t i_qp );
|
||||
#define x264_dequant_8x8_msa x264_template(dequant_8x8_msa)
|
||||
void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
|
||||
int32_t i_qp );
|
||||
#define x264_dequant_4x4_dc_msa x264_template(dequant_4x4_dc_msa)
|
||||
void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
|
||||
int32_t i_qp );
|
||||
#define x264_quant_4x4_msa x264_template(quant_4x4_msa)
|
||||
int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
|
||||
#define x264_quant_4x4x4_msa x264_template(quant_4x4x4_msa)
|
||||
int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
|
||||
uint16_t pu_mf[16], uint16_t pu_bias[16] );
|
||||
#define x264_quant_8x8_msa x264_template(quant_8x8_msa)
|
||||
int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias );
|
||||
#define x264_quant_4x4_dc_msa x264_template(quant_4x4_dc_msa)
|
||||
int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias );
|
||||
#define x264_coeff_last64_msa x264_template(coeff_last64_msa)
|
||||
int32_t x264_coeff_last64_msa( int16_t *p_src );
|
||||
#define x264_coeff_last16_msa x264_template(coeff_last16_msa)
|
||||
int32_t x264_coeff_last16_msa( int16_t *p_src );
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user