Commit 869d6c05 authored by Yunqing Wang
Browse files

Optimize 16x16 idct10 function

Wrote an SSE2 version of the vp9_short_idct10_16x16 function. Compared
to the C version, the SSE2 version is 2.3x faster.

Change-Id: I314c4f09369648721798321eeed6f58e38857f26
parent 8a3233b5
......@@ -298,7 +298,7 @@ prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct16x16 sse2
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_16x16
specialize vp9_short_idct10_16x16 sse2
prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_16x16
This diff is collapsed.
......@@ -315,7 +315,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
input[48] *= dq[1];
// the idct halves ( >> 1) the pitch
vp9_short_idct10_16x16_c(input, output, 32);
vp9_short_idct10_16x16(input, output, 32);
input[0] = input[1] = input[2] = input[3] = 0;
input[16] = input[17] = input[18] = 0;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment