Commit fb73ceae authored by Parag Salasakar's avatar Parag Salasakar

mips msa vp8 bilinear filter optimization

average improvement ~3x-4x

Change-Id: I8c0b3d5c86c9eb4f802b87c971864d2cfceeb7cc
parent fd660f01
This diff is collapsed.
......@@ -460,6 +460,27 @@
SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
}
/* Description : Immediate number of elements to slide
Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
Outputs - out0, out1
Return Type - as per RTYPE
Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
value specified in the 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
{ \
out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
}
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
out0, out1, out2, slide_val) \
{ \
SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val); \
out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
}
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
/* Description : Shuffle byte vector elements as per mask vector
Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
Outputs - out0, out1
......@@ -472,7 +493,9 @@
out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
out0, out1, out2) \
......@@ -482,6 +505,32 @@
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
/* Description : Dot product of byte vector elements
Arguments : Inputs - mult0, mult1, cnst0, cnst1
Outputs - out0, out1
Return Type - as per RTYPE
Details : Unsigned byte elements from 'mult0' are multiplied with
unsigned byte elements from 'cnst0' producing a result
twice the size of input i.e. unsigned halfword.
The multiplication result of adjacent odd-even elements
are added together and written to the 'out0' vector
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
cnst0, cnst1, cnst2, cnst3, \
out0, out1, out2, out3) \
{ \
DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
/* Description : Dot product of byte vector elements
Arguments : Inputs - mult0, mult1, cnst0, cnst1
Outputs - out0, out1
......@@ -768,6 +817,7 @@
ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
/* Description : Interleave both left and right half of input vectors
......@@ -1252,4 +1302,30 @@
out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
out_m; \
})
/* Description : Pack even byte elements and store byte vector in destination
memory
Arguments : Inputs - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst) \
{ \
v16i8 tmp_m; \
tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
ST_SB(tmp_m, (pdst)); \
}
/* Description : Horizontal 2 tap filter kernel code
Arguments : Inputs - in0, in1, mask, coeff, shift
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
({ \
v16i8 tmp0_m; \
v8u16 tmp1_m; \
\
tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
\
tmp1_m; \
})
#endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */
......@@ -225,20 +225,20 @@ $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon/;
specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/;
$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon/;
specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/;
$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
specialize qw/vp8_bilinear_predict8x4 mmx media neon/;
specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
#TODO(johannkoenig): fix the neon version https://code.google.com/p/webm/issues/detail?id=892
specialize qw/vp8_bilinear_predict4x4 mmx media/;
specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
#
......
......@@ -114,6 +114,7 @@ VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c
VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c
# common (c)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/bilinear_filter_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment