Commit 81c60419 authored by Paul Wilkins
Browse files

Fix SIMD-unsafe use of floating point.

This commit fixes unsafe SIMD / floating-point interactions arising
from the current hybrid and 16x16 transform implementations.
These led to a raft of bugs and issues when the project was
built using VS2008 for Win32, though they did not show up with
the Unix builds.

Gerrit makes a meal out of presenting the fix, but all I have actually
done is indent the body of each function that uses floating point by
one level and bracket it with emms instructions using the function
vp8_clear_system_state(). See below.

function () {
  vp8_clear_system_state();
  {
  ... function body
  }
  vp8_clear_system_state();
}

This is almost certainly over the top in terms of the number of emms
instructions, but it is a temporary measure pending implementation of
integer variants of each function to replace the floating point.

Limited testing suggests that this fixes the problems that arose for
Win32 VS2008 when the hybrid or 16x16 transforms were enabled.

Change-Id: I7c9a72bd79315246ed880578dec51e2b7c178442
Showing with 593 additions and 552 deletions
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
**************************************************************************/ **************************************************************************/
#include "vpx_ports/config.h" #include "vpx_ports/config.h"
#include "vp8/common/idct.h" #include "vp8/common/idct.h"
#include "vp8/common/systemdependent.h"
#if CONFIG_HYBRIDTRANSFORM #if CONFIG_HYBRIDTRANSFORM
#include "vp8/common/blockd.h" #include "vp8/common/blockd.h"
...@@ -166,92 +167,71 @@ float iadst_16[256] = { ...@@ -166,92 +167,71 @@ float iadst_16[256] = {
#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
void vp8_ihtllm_c(short *input, short *output, int pitch, void vp8_ihtllm_c(short *input, short *output, int pitch,
TX_TYPE tx_type, int tx_dim) { TX_TYPE tx_type, int tx_dim) {
int i, j, k;
float bufa[256], bufb[256]; // buffers are for floating-point test purpose
// the implementation could be simplified in
// conjunction with integer transform
// further notice, since we are thinking to use one
// function for both 4x4 and 8x8 transforms, the
// temporary buffers are simply initialized with 64.
short *ip = input;
short *op = output;
int shortpitch = pitch >> 1;
float *pfa = &bufa[0]; vp8_clear_system_state(); // Make it simd safe : __asm emms;
float *pfb = &bufb[0]; {
int i, j, k;
// pointers to vertical and horizontal transforms float bufa[256], bufb[256]; // buffers are for floating-point test purpose
float *ptv, *pth; // the implementation could be simplified in
// conjunction with integer transform
// load and convert residual array into floating-point
for(j = 0; j < tx_dim; j++) { // further notice, since we are thinking to use
for(i = 0; i < tx_dim; i++) { // one function for both 4x4 and 8x8 transforms
pfa[i] = (float)ip[i]; // the temporary buffers are simply initialized
} // with 64.
pfa += tx_dim; short *ip = input;
ip += tx_dim; short *op = output;
} int shortpitch = pitch >> 1;
// vertical transformation float *pfa = &bufa[0];
pfa = &bufa[0]; float *pfb = &bufb[0];
pfb = &bufb[0];
// pointers to vertical and horizontal transforms
switch(tx_type) { float *ptv, *pth;
case ADST_ADST :
case ADST_DCT : // load and convert residual array into floating-point
ptv = (tx_dim == 4) ? &iadst_4[0] : for(j = 0; j < tx_dim; j++) {
((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]); for(i = 0; i < tx_dim; i++) {
break; pfa[i] = (float)ip[i];
default :
ptv = (tx_dim == 4) ? &idct_4[0] :
((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
break;
}
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
pfb[i] = 0 ;
for(k = 0; k < tx_dim; k++) {
pfb[i] += ptv[k] * pfa[(k * tx_dim)];
} }
pfa += 1; pfa += tx_dim;
ip += tx_dim;
} }
pfb += tx_dim; // vertical transformation
ptv += tx_dim;
pfa = &bufa[0]; pfa = &bufa[0];
} pfb = &bufb[0];
// horizontal transformation switch(tx_type) {
pfa = &bufa[0]; case ADST_ADST :
pfb = &bufb[0]; case ADST_DCT :
ptv = (tx_dim == 4) ? &iadst_4[0] :
switch(tx_type) { ((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
case ADST_ADST : break;
case DCT_ADST :
pth = (tx_dim == 4) ? &iadst_4[0] :
((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
break;
default :
pth = (tx_dim == 4) ? &idct_4[0] :
((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
break;
}
for(j = 0; j < tx_dim; j++) { default :
for(i = 0; i < tx_dim; i++) { ptv = (tx_dim == 4) ? &idct_4[0] :
pfa[i] = 0; ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
for(k = 0; k < tx_dim; k++) { break;
pfa[i] += pfb[k] * pth[k]; }
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
pfb[i] = 0 ;
for(k = 0; k < tx_dim; k++) {
pfb[i] += ptv[k] * pfa[(k * tx_dim)];
}
pfa += 1;
} }
pth += tx_dim;
}
pfa += tx_dim; pfb += tx_dim;
pfb += tx_dim; ptv += tx_dim;
pfa = &bufa[0];
}
// horizontal transformation
pfa = &bufa[0];
pfb = &bufb[0];
switch(tx_type) { switch(tx_type) {
case ADST_ADST : case ADST_ADST :
...@@ -265,21 +245,48 @@ void vp8_ihtllm_c(short *input, short *output, int pitch, ...@@ -265,21 +245,48 @@ void vp8_ihtllm_c(short *input, short *output, int pitch,
((tx_dim == 8) ? &idct_8[0] : &idct_16[0]); ((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
break; break;
} }
}
// convert to short integer format and load BLOCKD buffer
op = output;
pfa = &bufa[0];
for(j = 0; j < tx_dim; j++) { for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) { for(i = 0; i < tx_dim; i++) {
op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) : pfa[i] = 0;
-(short)( - pfa[i] / 8 + 0.49); for(k = 0; k < tx_dim; k++) {
pfa[i] += pfb[k] * pth[k];
}
pth += tx_dim;
}
pfa += tx_dim;
pfb += tx_dim;
switch(tx_type) {
case ADST_ADST :
case DCT_ADST :
pth = (tx_dim == 4) ? &iadst_4[0] :
((tx_dim == 8) ? &iadst_8[0] : &iadst_16[0]);
break;
default :
pth = (tx_dim == 4) ? &idct_4[0] :
((tx_dim == 8) ? &idct_8[0] : &idct_16[0]);
break;
}
} }
op += shortpitch; // convert to short integer format and load BLOCKD buffer
pfa += tx_dim; op = output;
pfa = &bufa[0];
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
op[i] = (pfa[i] > 0 ) ? (short)( pfa[i] / 8 + 0.49) :
-(short)( - pfa[i] / 8 + 0.49);
}
op += shortpitch;
pfa += tx_dim;
}
} }
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
#endif #endif
...@@ -776,25 +783,30 @@ void vp8_short_ihaar2x2_c(short *input, short *output, int pitch) { ...@@ -776,25 +783,30 @@ void vp8_short_ihaar2x2_c(short *input, short *output, int pitch) {
#if 0 #if 0
// Keep a really bad float version as reference for now. // Keep a really bad float version as reference for now.
void vp8_short_idct16x16_c(short *input, short *output, int pitch) { void vp8_short_idct16x16_c(short *input, short *output, int pitch) {
double x;
const int short_pitch = pitch >> 1; vp8_clear_system_state(); // Make it simd safe : __asm emms;
int i, j, k, l; {
for (l = 0; l < 16; ++l) { double x;
for (k = 0; k < 16; ++k) { const int short_pitch = pitch >> 1;
double s = 0; int i, j, k, l;
for (i = 0; i < 16; ++i) { for (l = 0; l < 16; ++l) {
for (j = 0; j < 16; ++j) { for (k = 0; k < 16; ++k) {
x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32; double s = 0;
if (i != 0) for (i = 0; i < 16; ++i) {
x *= sqrt(2.0); for (j = 0; j < 16; ++j) {
if (j != 0) x=cos(PI*j*(l+0.5)/16.0)*cos(PI*i*(k+0.5)/16.0)*input[i*16+j]/32;
x *= sqrt(2.0); if (i != 0)
s += x; x *= sqrt(2.0);
if (j != 0)
x *= sqrt(2.0);
s += x;
}
} }
output[k*short_pitch+l] = (short)round(s);
} }
output[k*short_pitch+l] = (short)round(s);
} }
} }
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
#endif #endif
...@@ -816,231 +828,246 @@ static const double C15 = 0.098017140329561; ...@@ -816,231 +828,246 @@ static const double C15 = 0.098017140329561;
static void butterfly_16x16_idct_1d(double input[16], double output[16]) { static void butterfly_16x16_idct_1d(double input[16], double output[16]) {
double step[16];
double intermediate[16]; vp8_clear_system_state(); // Make it simd safe : __asm emms;
double temp1, temp2; {
double step[16];
double intermediate[16];
// step 1 and 2 double temp1, temp2;
step[ 0] = input[0] + input[8];
step[ 1] = input[0] - input[8];
// step 1 and 2
temp1 = input[4]*C12; step[ 0] = input[0] + input[8];
temp2 = input[12]*C4; step[ 1] = input[0] - input[8];
temp1 -= temp2; temp1 = input[4]*C12;
temp1 *= C8; temp2 = input[12]*C4;
step[ 2] = 2*(temp1); temp1 -= temp2;
temp1 *= C8;
temp1 = input[4]*C4;
temp2 = input[12]*C12; step[ 2] = 2*(temp1);
temp1 += temp2;
temp1 = (temp1); temp1 = input[4]*C4;
temp1 *= C8; temp2 = input[12]*C12;
step[ 3] = 2*(temp1); temp1 += temp2;
temp1 = (temp1);
temp1 = input[2]*C8; temp1 *= C8;
temp1 = 2*(temp1); step[ 3] = 2*(temp1);
temp2 = input[6] + input[10];
temp1 = input[2]*C8;
step[ 4] = temp1 + temp2; temp1 = 2*(temp1);
step[ 5] = temp1 - temp2; temp2 = input[6] + input[10];
temp1 = input[14]*C8; step[ 4] = temp1 + temp2;
temp1 = 2*(temp1); step[ 5] = temp1 - temp2;
temp2 = input[6] - input[10];
temp1 = input[14]*C8;
step[ 6] = temp2 - temp1; temp1 = 2*(temp1);
step[ 7] = temp2 + temp1; temp2 = input[6] - input[10];
// for odd input step[ 6] = temp2 - temp1;
temp1 = input[3]*C12; step[ 7] = temp2 + temp1;
temp2 = input[13]*C4;
temp1 += temp2; // for odd input
temp1 = (temp1); temp1 = input[3]*C12;
temp1 *= C8; temp2 = input[13]*C4;
intermediate[ 8] = 2*(temp1); temp1 += temp2;
temp1 = (temp1);
temp1 = input[3]*C4; temp1 *= C8;
temp2 = input[13]*C12; intermediate[ 8] = 2*(temp1);
temp2 -= temp1;
temp2 = (temp2); temp1 = input[3]*C4;
temp2 *= C8; temp2 = input[13]*C12;
intermediate[ 9] = 2*(temp2); temp2 -= temp1;
temp2 = (temp2);
intermediate[10] = 2*(input[9]*C8); temp2 *= C8;
intermediate[11] = input[15] - input[1]; intermediate[ 9] = 2*(temp2);
intermediate[12] = input[15] + input[1];
intermediate[13] = 2*((input[7]*C8)); intermediate[10] = 2*(input[9]*C8);
intermediate[11] = input[15] - input[1];
temp1 = input[11]*C12; intermediate[12] = input[15] + input[1];
temp2 = input[5]*C4; intermediate[13] = 2*((input[7]*C8));
temp2 -= temp1;
temp2 = (temp2); temp1 = input[11]*C12;
temp2 *= C8; temp2 = input[5]*C4;
intermediate[14] = 2*(temp2); temp2 -= temp1;
temp2 = (temp2);
temp1 = input[11]*C4; temp2 *= C8;
temp2 = input[5]*C12; intermediate[14] = 2*(temp2);
temp1 += temp2;
temp1 = (temp1); temp1 = input[11]*C4;
temp1 *= C8; temp2 = input[5]*C12;
intermediate[15] = 2*(temp1); temp1 += temp2;
temp1 = (temp1);
step[ 8] = intermediate[ 8] + intermediate[14]; temp1 *= C8;
step[ 9] = intermediate[ 9] + intermediate[15]; intermediate[15] = 2*(temp1);
step[10] = intermediate[10] + intermediate[11];
step[11] = intermediate[10] - intermediate[11]; step[ 8] = intermediate[ 8] + intermediate[14];
step[12] = intermediate[12] + intermediate[13]; step[ 9] = intermediate[ 9] + intermediate[15];
step[13] = intermediate[12] - intermediate[13]; step[10] = intermediate[10] + intermediate[11];
step[14] = intermediate[ 8] - intermediate[14]; step[11] = intermediate[10] - intermediate[11];
step[15] = intermediate[ 9] - intermediate[15]; step[12] = intermediate[12] + intermediate[13];
step[13] = intermediate[12] - intermediate[13];
// step 3 step[14] = intermediate[ 8] - intermediate[14];
output[0] = step[ 0] + step[ 3]; step[15] = intermediate[ 9] - intermediate[15];
output[1] = step[ 1] + step[ 2];
output[2] = step[ 1] - step[ 2]; // step 3
output[3] = step[ 0] - step[ 3]; output[0] = step[ 0] + step[ 3];
output[1] = step[ 1] + step[ 2];
temp1 = step[ 4]*C14; output[2] = step[ 1] - step[ 2];
temp2 = step[ 7]*C2; output[3] = step[ 0] - step[ 3];
temp1 -= temp2;
output[4] = (temp1); temp1 = step[ 4]*C14;
temp2 = step[ 7]*C2;
temp1 = step[ 4]*C2; temp1 -= temp2;
temp2 = step[ 7]*C14; output[4] = (temp1);
temp1 += temp2;
output[7] = (temp1); temp1 = step[ 4]*C2;
temp2 = step[ 7]*C14;
temp1 = step[ 5]*C10; temp1 += temp2;
temp2 = step[ 6]*C6; output[7] = (temp1);
temp1 -= temp2;
output[5] = (temp1); temp1 = step[ 5]*C10;
temp2 = step[ 6]*C6;
temp1 = step[ 5]*C6; temp1 -= temp2;
temp2 = step[ 6]*C10; output[5] = (temp1);
temp1 += temp2;
output[6] = (temp1); temp1 = step[ 5]*C6;
temp2 = step[ 6]*C10;
output[8] = step[ 8] + step[11]; temp1 += temp2;
output[9] = step[ 9] + step[10]; output[6] = (temp1);
output[10] = step[ 9] - step[10];
output[11] = step[ 8] - step[11]; output[8] = step[ 8] + step[11];
output[12] = step[12] + step[15]; output[9] = step[ 9] + step[10];
output[13] = step[13] + step[14]; output[10] = step[ 9] - step[10];
output[14] = step[13] - step[14]; output[11] = step[ 8] - step[11];
output[15] = step[12] - step[15]; output[12] = step[12] + step[15];
output[13] = step[13] + step[14];
// output 4 output[14] = step[13] - step[14];
step[ 0] = output[0] + output[7]; output[15] = step[12] - step[15];
step[ 1] = output[1] + output[6];
step[ 2] = output[2] + output[5]; // output 4
step[ 3] = output[3] + output[4]; step[ 0] = output[0] + output[7];
step[ 4] = output[3] - output[4]; step[ 1] = output[1] + output[6];
step[ 5] = output[2] - output[5]; step[ 2] = output[2] + output[5];
step[ 6] = output[1] - output[6]; step[ 3] = output[3] + output[4];
step[ 7] = output[0] - output[7]; step[ 4] = output[3] - output[4];
step[ 5] = output[2] - output[5];
temp1 = output[8]*C7; step[ 6] = output[1] - output[6];
temp2 = output[15]*C9; step[ 7] = output[0] - output[7];
temp1 -= temp2;
step[ 8] = (temp1); temp1 = output[8]*C7;
temp2 = output[15]*C9;
temp1 = output[9]*C11; temp1 -= temp2;
temp2 = output[14]*C5; step[ 8] = (temp1);
temp1 += temp2;
step[ 9] = (temp1); temp1 = output[9]*C11;
temp2 = output[14]*C5;
temp1 = output[10]*C3; temp1 += temp2;
temp2 = output[13]*C13; step[ 9] = (temp1);
temp1 -= temp2;
step[10] = (temp1); temp1 = output[10]*C3;
temp2 = output[13]*C13;
temp1 = output[11]*C15; temp1 -= temp2;
temp2 = output[12]*C1; step[10] = (temp1);
temp1 += temp2;
step[11] = (temp1); temp1 = output[11]*C15;
temp2 = output[12]*C1;
temp1 = output[11]*C1; temp1 += temp2;
temp2 = output[12]*C15; step[11] = (temp1);
temp2 -= temp1;
step[12] = (temp2); temp1 = output[11]*C1;
temp2 = output[12]*C15;
temp1 = output[10]*C13; temp2 -= temp1;
temp2 = output[13]*C3; step[12] = (temp2);
temp1 += temp2;
step[13] = (temp1); temp1 = output[10]*C13;
temp2 = output[13]*C3;
temp1 = output[9]*C5; temp1 += temp2;
temp2 = output[14]*C11; step[13] = (temp1);
temp2 -= temp1;
step[14] = (temp2); temp1 = output[9]*C5;
temp2 = output[14]*C11;
temp1 = output[8]*C9; temp2 -= temp1;
temp2 = output[15]*C7; step[14] = (temp2);
temp1 += temp2;
step[15] = (temp1); temp1 = output[8]*C9;
temp2 = output[15]*C7;
// step 5 temp1 += temp2;
output[0] = (step[0] + step[15]); step[15] = (temp1);
output[1] = (step[1] + step[14]);
output[2] = (step[2] + step[13]); // step 5
output[3] = (step[3] + step[12]); output[0] = (step[0] + step[15]);
output[4] = (step[4] + step[11]); output[1] = (step[1] + step[14]);
output[5] = (step[5] + step[10]); output[2] = (step[2] + step[13]);
output[6] = (step[6] + step[ 9]); output[3] = (step[3] + step[12]);
output[7] = (step[7] + step[ 8]); output[4] = (step[4] + step[11]);
output[5] = (step[5] + step[10]);
output[15] = (step[0] - step[15]); output[6] = (step[6] + step[ 9]);
output[14] = (step[1] - step[14]); output[7] = (step[7] + step[ 8]);
output[13] = (step[2] - step[13]);
output[12] = (step[3] - step[12]); output[15] = (step[0] - step[15]);
output[11] = (step[4] - step[11]); output[14] = (step[1] - step[14]);
output[10] = (step[5] - step[10]); output[13] = (step[2] - step[13]);
output[9] = (step[6] - step[ 9]); output[12] = (step[3] - step[12]);
output[8] = (step[7] - step[ 8]); output[11] = (step[4] - step[11]);
output[10] = (step[5] - step[10]);
output[9] = (step[6] - step[ 9]);
output[8] = (step[7] - step[ 8]);
}
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
// Remove once an int version of iDCT is written // Remove once an int version of iDCT is written
#if 0 #if 0
void reference_16x16_idct_1d(double input[16], double output[16]) { void reference_16x16_idct_1d(double input[16], double output[16]) {
const double kPi = 3.141592653589793238462643383279502884;
const double kSqrt2 = 1.414213562373095048801688724209698; vp8_clear_system_state(); // Make it simd safe : __asm emms;
for (int k = 0; k < 16; k++) { {
output[k] = 0.0; const double kPi = 3.141592653589793238462643383279502884;
for (int n = 0; n < 16; n++) { const double kSqrt2 = 1.414213562373095048801688724209698;
output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0); for (int k = 0; k < 16; k++) {
if (n == 0) output[k] = 0.0;
output[k] = output[k]/kSqrt2; for (int n = 0; n < 16; n++) {
output[k] += input[n]*cos(kPi*(2*k+1)*n/32.0);
if (n == 0)
output[k] = output[k]/kSqrt2;
}
} }
} }
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
#endif #endif
void vp8_short_idct16x16_c(short *input, short *output, int pitch) { void vp8_short_idct16x16_c(short *input, short *output, int pitch) {
double out[16*16], out2[16*16];
const int short_pitch = pitch >> 1; vp8_clear_system_state(); // Make it simd safe : __asm emms;
int i, j; {
// First transform rows double out[16*16], out2[16*16];
for (i = 0; i < 16; ++i) { const int short_pitch = pitch >> 1;
double temp_in[16], temp_out[16]; int i, j;
for (j = 0; j < 16; ++j) // First transform rows
temp_in[j] = input[j + i*short_pitch]; for (i = 0; i < 16; ++i) {
butterfly_16x16_idct_1d(temp_in, temp_out); double temp_in[16], temp_out[16];
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j)
out[j + i*16] = temp_out[j]; temp_in[j] = input[j + i*short_pitch];
} butterfly_16x16_idct_1d(temp_in, temp_out);
// Then transform columns for (j = 0; j < 16; ++j)
for (i = 0; i < 16; ++i) { out[j + i*16] = temp_out[j];
double temp_in[16], temp_out[16]; }
for (j = 0; j < 16; ++j) // Then transform columns
temp_in[j] = out[j*16 + i]; for (i = 0; i < 16; ++i) {
butterfly_16x16_idct_1d(temp_in, temp_out); double temp_in[16], temp_out[16];
for (j = 0; j < 16; ++j) for (j = 0; j < 16; ++j)
out2[j*16 + i] = temp_out[j]; temp_in[j] = out[j*16 + i];
butterfly_16x16_idct_1d(temp_in, temp_out);
for (j = 0; j < 16; ++j)
out2[j*16 + i] = temp_out[j];
}
for (i = 0; i < 16*16; ++i)
output[i] = round(out2[i]/128);
} }
for (i = 0; i < 16*16; ++i) vp8_clear_system_state(); // Make it simd safe : __asm emms;
output[i] = round(out2[i]/128);
} }
#endif #endif
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <math.h> #include <math.h>
#include "vpx_ports/config.h" #include "vpx_ports/config.h"
#include "vp8/common/idct.h" #include "vp8/common/idct.h"
#include "vp8/common/systemdependent.h"
#if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16 #if CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM16X16
...@@ -402,87 +403,64 @@ void vp8_short_fhaar2x2_c(short *input, short *output, int pitch) { // pitch = 8 ...@@ -402,87 +403,64 @@ void vp8_short_fhaar2x2_c(short *input, short *output, int pitch) { // pitch = 8
#if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16 #if CONFIG_HYBRIDTRANSFORM8X8 || CONFIG_HYBRIDTRANSFORM || CONFIG_HYBRIDTRANSFORM16X16
void vp8_fht_c(short *input, short *output, int pitch, void vp8_fht_c(short *input, short *output, int pitch,
TX_TYPE tx_type, int tx_dim) { TX_TYPE tx_type, int tx_dim) {
int i, j, k;
float bufa[256], bufb[256]; // buffers are for floating-point test purpose
// the implementation could be simplified in
// conjunction with integer transform
short *ip = input;
short *op = output;
float *pfa = &bufa[0];
float *pfb = &bufb[0];
// pointers to vertical and horizontal transforms
float *ptv, *pth;
// load and convert residual array into floating-point
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
pfa[i] = (float)ip[i];
}
pfa += tx_dim;
ip += pitch / 2;
}
// vertical transformation
pfa = &bufa[0];
pfb = &bufb[0];
switch(tx_type) {
case ADST_ADST :
case ADST_DCT :
ptv = (tx_dim == 4) ? &adst_4[0] :
((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
break;
default :
ptv = (tx_dim == 4) ? &dct_4[0] :
((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
break;
}
for(j = 0; j < tx_dim; j++) { vp8_clear_system_state(); // Make it simd safe : __asm emms;
for(i = 0; i < tx_dim; i++) { {
pfb[i] = 0; int i, j, k;
for(k = 0; k < tx_dim; k++) { float bufa[256], bufb[256]; // buffers are for floating-point test purpose
pfb[i] += ptv[k] * pfa[(k * tx_dim)]; // the implementation could be simplified in
// conjunction with integer transform
short *ip = input;
short *op = output;
float *pfa = &bufa[0];
float *pfb = &bufb[0];
// pointers to vertical and horizontal transforms
float *ptv, *pth;
// load and convert residual array into floating-point
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
pfa[i] = (float)ip[i];
} }
pfa += 1; pfa += tx_dim;
ip += pitch / 2;
} }
pfb += tx_dim;
ptv += tx_dim; // vertical transformation
pfa = &bufa[0]; pfa = &bufa[0];
} pfb = &bufb[0];
// horizontal transformation switch(tx_type) {
pfa = &bufa[0]; case ADST_ADST :
pfb = &bufb[0]; case ADST_DCT :
ptv = (tx_dim == 4) ? &adst_4[0] :
switch(tx_type) { ((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
case ADST_ADST : break;
case DCT_ADST :
pth = (tx_dim == 4) ? &adst_4[0] :
((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
break;
default :
pth = (tx_dim == 4) ? &dct_4[0] :
((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
break;
}
for(j = 0; j < tx_dim; j++) { default :
for(i = 0; i < tx_dim; i++) { ptv = (tx_dim == 4) ? &dct_4[0] :
pfa[i] = 0; ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
for(k = 0; k < tx_dim; k++) { break;
pfa[i] += pfb[k] * pth[k]; }
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
pfb[i] = 0;
for(k = 0; k < tx_dim; k++) {
pfb[i] += ptv[k] * pfa[(k * tx_dim)];
}
pfa += 1;
} }
pth += tx_dim; pfb += tx_dim;
ptv += tx_dim;
pfa = &bufa[0];
} }
pfa += tx_dim; // horizontal transformation
pfb += tx_dim; pfa = &bufa[0];
// pth -= tx_dim * tx_dim; pfb = &bufb[0];
switch(tx_type) { switch(tx_type) {
case ADST_ADST : case ADST_ADST :
...@@ -496,20 +474,48 @@ void vp8_fht_c(short *input, short *output, int pitch, ...@@ -496,20 +474,48 @@ void vp8_fht_c(short *input, short *output, int pitch,
((tx_dim == 8) ? &dct_8[0] : &dct_16[0]); ((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
break; break;
} }
}
// convert to short integer format and load BLOCKD buffer for(j = 0; j < tx_dim; j++) {
op = output ; for(i = 0; i < tx_dim; i++) {
pfa = &bufa[0] ; pfa[i] = 0;
for(k = 0; k < tx_dim; k++) {
pfa[i] += pfb[k] * pth[k];
}
pth += tx_dim;
}
for(j = 0; j < tx_dim; j++) { pfa += tx_dim;
for(i = 0; i < tx_dim; i++) { pfb += tx_dim;
op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) : // pth -= tx_dim * tx_dim;
-(short)(- 8 * pfa[i] + 0.49);
switch(tx_type) {
case ADST_ADST :
case DCT_ADST :
pth = (tx_dim == 4) ? &adst_4[0] :
((tx_dim == 8) ? &adst_8[0] : &adst_16[0]);
break;
default :
pth = (tx_dim == 4) ? &dct_4[0] :
((tx_dim == 8) ? &dct_8[0] : &dct_16[0]);
break;
}
}
// convert to short integer format and load BLOCKD buffer
op = output ;
pfa = &bufa[0] ;
for(j = 0; j < tx_dim; j++) {
for(i = 0; i < tx_dim; i++) {
op[i] = (pfa[i] > 0 ) ? (short)( 8 * pfa[i] + 0.49) :
-(short)(- 8 * pfa[i] + 0.49);
}
op += tx_dim;
pfa += tx_dim;
} }
op += tx_dim;
pfa += tx_dim;
} }
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
#endif #endif
...@@ -705,162 +711,168 @@ static const double C14 = 0.195090322016128; ...@@ -705,162 +711,168 @@ static const double C14 = 0.195090322016128;
static const double C15 = 0.098017140329561; static const double C15 = 0.098017140329561;
static void dct16x16_1d(double input[16], double output[16]) { static void dct16x16_1d(double input[16], double output[16]) {
double step[16]; vp8_clear_system_state(); // Make it simd safe : __asm emms;
double intermediate[16]; {
double temp1, temp2; double step[16];
double intermediate[16];
// step 1 double temp1, temp2;
step[ 0] = input[0] + input[15];
step[ 1] = input[1] + input[14]; // step 1
step[ 2] = input[2] + input[13]; step[ 0] = input[0] + input[15];
step[ 3] = input[3] + input[12]; step[ 1] = input[1] + input[14];
step[ 4] = input[4] + input[11]; step[ 2] = input[2] + input[13];
step[ 5] = input[5] + input[10]; step[ 3] = input[3] + input[12];
step[ 6] = input[6] + input[ 9]; step[ 4] = input[4] + input[11];
step[ 7] = input[7] + input[ 8]; step[ 5] = input[5] + input[10];
step[ 8] = input[7] - input[ 8]; step[ 6] = input[6] + input[ 9];
step[ 9] = input[6] - input[ 9]; step[ 7] = input[7] + input[ 8];
step[10] = input[5] - input[10]; step[ 8] = input[7] - input[ 8];
step[11] = input[4] - input[11]; step[ 9] = input[6] - input[ 9];
step[12] = input[3] - input[12]; step[10] = input[5] - input[10];
step[13] = input[2] - input[13]; step[11] = input[4] - input[11];
step[14] = input[1] - input[14]; step[12] = input[3] - input[12];
step[15] = input[0] - input[15]; step[13] = input[2] - input[13];
step[14] = input[1] - input[14];
// step 2 step[15] = input[0] - input[15];
output[0] = step[0] + step[7];
output[1] = step[1] + step[6]; // step 2
output[2] = step[2] + step[5]; output[0] = step[0] + step[7];
output[3] = step[3] + step[4]; output[1] = step[1] + step[6];
output[4] = step[3] - step[4]; output[2] = step[2] + step[5];
output[5] = step[2] - step[5]; output[3] = step[3] + step[4];
output[6] = step[1] - step[6]; output[4] = step[3] - step[4];
output[7] = step[0] - step[7]; output[5] = step[2] - step[5];
output[6] = step[1] - step[6];
temp1 = step[ 8]*C7; output[7] = step[0] - step[7];
temp2 = step[15]*C9;
output[ 8] = temp1 + temp2; temp1 = step[ 8]*C7;
temp2 = step[15]*C9;
temp1 = step[ 9]*C11; output[ 8] = temp1 + temp2;
temp2 = step[14]*C5;
output[ 9] = temp1 - temp2; temp1 = step[ 9]*C11;
temp2 = step[14]*C5;
temp1 = step[10]*C3; output[ 9] = temp1 - temp2;
temp2 = step[13]*C13;
output[10] = temp1 + temp2; temp1 = step[10]*C3;
temp2 = step[13]*C13;
temp1 = step[11]*C15; output[10] = temp1 + temp2;
temp2 = step[12]*C1;
output[11] = temp1 - temp2; temp1 = step[11]*C15;
temp2 = step[12]*C1;
temp1 = step[11]*C1; output[11] = temp1 - temp2;
temp2 = step[12]*C15;
output[12] = temp2 + temp1; temp1 = step[11]*C1;
temp2 = step[12]*C15;
temp1 = step[10]*C13; output[12] = temp2 + temp1;
temp2 = step[13]*C3;
output[13] = temp2 - temp1; temp1 = step[10]*C13;
temp2 = step[13]*C3;
temp1 = step[ 9]*C5; output[13] = temp2 - temp1;
temp2 = step[14]*C11;
output[14] = temp2 + temp1; temp1 = step[ 9]*C5;
temp2 = step[14]*C11;
temp1 = step[ 8]*C9; output[14] = temp2 + temp1;
temp2 = step[15]*C7;
output[15] = temp2 - temp1; temp1 = step[ 8]*C9;
temp2 = step[15]*C7;
// step 3 output[15] = temp2 - temp1;
step[ 0] = output[0] + output[3];
step[ 1] = output[1] + output[2]; // step 3
step[ 2] = output[1] - output[2]; step[ 0] = output[0] + output[3];
step[ 3] = output[0] - output[3]; step[ 1] = output[1] + output[2];
step[ 2] = output[1] - output[2];
temp1 = output[4]*C14; step[ 3] = output[0] - output[3];
temp2 = output[7]*C2;
step[ 4] = temp1 + temp2; temp1 = output[4]*C14;
temp2 = output[7]*C2;
temp1 = output[5]*C10; step[ 4] = temp1 + temp2;
temp2 = output[6]*C6;
step[ 5] = temp1 + temp2; temp1 = output[5]*C10;
temp2 = output[6]*C6;
temp1 = output[5]*C6; step[ 5] = temp1 + temp2;
temp2 = output[6]*C10;
step[ 6] = temp2 - temp1; temp1 = output[5]*C6;
temp2 = output[6]*C10;
temp1 = output[4]*C2; step[ 6] = temp2 - temp1;
temp2 = output[7]*C14;
step[ 7] = temp2 - temp1; temp1 = output[4]*C2;
temp2 = output[7]*C14;
step[ 8] = output[ 8] + output[11]; step[ 7] = temp2 - temp1;
step[ 9] = output[ 9] + output[10];
step[10] = output[ 9] - output[10]; step[ 8] = output[ 8] + output[11];
step[11] = output[ 8] - output[11]; step[ 9] = output[ 9] + output[10];
step[10] = output[ 9] - output[10];
step[12] = output[12] + output[15]; step[11] = output[ 8] - output[11];
step[13] = output[13] + output[14];
step[14] = output[13] - output[14]; step[12] = output[12] + output[15];
step[15] = output[12] - output[15]; step[13] = output[13] + output[14];
step[14] = output[13] - output[14];
// step 4 step[15] = output[12] - output[15];
output[ 0] = (step[ 0] + step[ 1]);
output[ 8] = (step[ 0] - step[ 1]); // step 4
output[ 0] = (step[ 0] + step[ 1]);
temp1 = step[2]*C12; output[ 8] = (step[ 0] - step[ 1]);
temp2 = step[3]*C4;
temp1 = temp1 + temp2; temp1 = step[2]*C12;
output[ 4] = 2*(temp1*C8); temp2 = step[3]*C4;
temp1 = temp1 + temp2;
temp1 = step[2]*C4; output[ 4] = 2*(temp1*C8);
temp2 = step[3]*C12;
temp1 = temp2 - temp1; temp1 = step[2]*C4;
output[12] = 2*(temp1*C8); temp2 = step[3]*C12;
temp1 = temp2 - temp1;
output[ 2] = 2*((step[4] + step[ 5])*C8); output[12] = 2*(temp1*C8);
output[14] = 2*((step[7] - step[ 6])*C8);
output[ 2] = 2*((step[4] + step[ 5])*C8);
temp1 = step[4] - step[5]; output[14] = 2*((step[7] - step[ 6])*C8);
temp2 = step[6] + step[7];
output[ 6] = (temp1 + temp2); temp1 = step[4] - step[5];
output[10] = (temp1 - temp2); temp2 = step[6] + step[7];
output[ 6] = (temp1 + temp2);
intermediate[8] = step[8] + step[14]; output[10] = (temp1 - temp2);
intermediate[9] = step[9] + step[15];
intermediate[8] = step[8] + step[14];
temp1 = intermediate[8]*C12; intermediate[9] = step[9] + step[15];
temp2 = intermediate[9]*C4;
temp1 = temp1 - temp2; temp1 = intermediate[8]*C12;
output[3] = 2*(temp1*C8); temp2 = intermediate[9]*C4;
temp1 = temp1 - temp2;
temp1 = intermediate[8]*C4; output[3] = 2*(temp1*C8);
temp2 = intermediate[9]*C12;
temp1 = temp2 + temp1; temp1 = intermediate[8]*C4;
output[13] = 2*(temp1*C8); temp2 = intermediate[9]*C12;
temp1 = temp2 + temp1;
output[ 9] = 2*((step[10] + step[11])*C8); output[13] = 2*(temp1*C8);
intermediate[11] = step[10] - step[11]; output[ 9] = 2*((step[10] + step[11])*C8);
intermediate[12] = step[12] + step[13];
intermediate[13] = step[12] - step[13]; intermediate[11] = step[10] - step[11];
intermediate[14] = step[ 8] - step[14]; intermediate[12] = step[12] + step[13];
intermediate[15] = step[ 9] - step[15]; intermediate[13] = step[12] - step[13];
intermediate[14] = step[ 8] - step[14];
output[15] = (intermediate[11] + intermediate[12]); intermediate[15] = step[ 9] - step[15];
output[ 1] = -(intermediate[11] - intermediate[12]);
output[15] = (intermediate[11] + intermediate[12]);
output[ 7] = 2*(intermediate[13]*C8); output[ 1] = -(intermediate[11] - intermediate[12]);
temp1 = intermediate[14]*C12; output[ 7] = 2*(intermediate[13]*C8);
temp2 = intermediate[15]*C4;
temp1 = temp1 - temp2; temp1 = intermediate[14]*C12;
output[11] = -2*(temp1*C8); temp2 = intermediate[15]*C4;
temp1 = temp1 - temp2;
temp1 = intermediate[14]*C4; output[11] = -2*(temp1*C8);
temp2 = intermediate[15]*C12;
temp1 = temp2 + temp1; temp1 = intermediate[14]*C4;
output[ 5] = 2*(temp1*C8); temp2 = intermediate[15]*C12;
temp1 = temp2 + temp1;
output[ 5] = 2*(temp1*C8);
}
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
void vp8_short_fdct16x16_c(short *input, short *out, int pitch) { void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {
vp8_clear_system_state(); // Make it simd safe : __asm emms;
{
int shortpitch = pitch >> 1; int shortpitch = pitch >> 1;
int i, j; int i, j;
double output[256]; double output[256];
...@@ -885,5 +897,7 @@ void vp8_short_fdct16x16_c(short *input, short *out, int pitch) { ...@@ -885,5 +897,7 @@ void vp8_short_fdct16x16_c(short *input, short *out, int pitch) {
// Scale by some magic number // Scale by some magic number
for (i = 0; i < 256; i++) for (i = 0; i < 256; i++)
out[i] = (short)round(output[i]/2); out[i] = (short)round(output[i]/2);
}
vp8_clear_system_state(); // Make it simd safe : __asm emms;
} }
#endif #endif
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment