Commit ec310066 authored by Yunqing Wang
Browse files

Optimize 16x16 idct function

Wrote an SSE2 version of the vp9_short_idct16x16 function. Compared to the
C version, the SSE2 version is over 2.5X faster.

Change-Id: I38536e2b846427a2cc5c5423aaf305fd0e605d61
parent 66eff0aa
...@@ -295,7 +295,7 @@ prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" ...@@ -295,7 +295,7 @@ prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8 specialize vp9_short_idct1_8x8
prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct16x16 specialize vp9_short_idct16x16 sse2
prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch" prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct10_16x16 specialize vp9_short_idct10_16x16
......
This diff is collapsed.
...@@ -273,7 +273,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input, ...@@ -273,7 +273,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
vp9_short_iht16x16(input, output, 16, tx_type); vp9_short_iht16x16(input, output, 16, tx_type);
// the idct halves ( >> 1) the pitch // the idct halves ( >> 1) the pitch
// vp9_short_idct16x16_c(input, output, 32); // vp9_short_idct16x16(input, output, 32);
vpx_memset(input, 0, 512); vpx_memset(input, 0, 512);
...@@ -296,7 +296,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, ...@@ -296,7 +296,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
int16_t in = input[0] * dq[0]; int16_t in = input[0] * dq[0];
int16_t out; int16_t out;
/* Note: the idct1 will need to be modified accordingly whenever /* Note: the idct1 will need to be modified accordingly whenever
* vp9_short_idct16x16_c() is modified. */ * vp9_short_idct16x16() is modified. */
vp9_short_idct1_16x16_c(&in, &out); vp9_short_idct1_16x16_c(&in, &out);
input[0] = 0; input[0] = 0;
...@@ -333,7 +333,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, ...@@ -333,7 +333,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
input[i] *= dq[1]; input[i] *= dq[1];
// the idct halves ( >> 1) the pitch // the idct halves ( >> 1) the pitch
vp9_short_idct16x16_c(input, output, 16 << 1); vp9_short_idct16x16(input, output, 16 << 1);
vpx_memset(input, 0, 512); vpx_memset(input, 0, 512);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment