Commit 005fc697 authored by Dmitry Kovalev

Finally removing "short" from transform names.

Change-Id: I5259b68dc1bcceb153e3ffe638a79a59a3019e9d
parent 4d8ebc9e
......@@ -273,7 +273,7 @@ void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
}
// Reference adapter for the 16x16 transform test: forwards in, out, stride and
// tx_type unchanged to the C implementation.
// Diff view: the vp9_short_fht16x16_c call is the pre-rename line and the
// vp9_fht16x16_c call is its replacement (the commit drops "short" from the
// transform names).
void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fht16x16_c(in, out, stride, tx_type);
vp9_fht16x16_c(in, out, stride, tx_type);
}
class Trans16x16TestBase {
......@@ -507,10 +507,10 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates the Trans16x16HT parameterized tests under the "C" prefix.
// Each tuple holds a forward-transform function pointer,
// &vp9_iht16x16_256_add_c, and an integer 0..3 (presumably the tx_type -
// confirm against the fixture).
// Diff view: the four vp9_short_fht16x16_c tuples are the pre-rename lines;
// the four vp9_fht16x16_c tuples are their replacements.
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
make_tuple(&vp9_short_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
......@@ -521,9 +521,9 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates the Trans16x16HT parameterized tests under the "SSE2" prefix
// (this sits inside the #if HAVE_SSE2 region). Each tuple holds a
// forward-transform function pointer, &vp9_iht16x16_256_add_sse2, and an
// integer 0..3 (presumably the tx_type - confirm against the fixture).
// Diff view: the vp9_short_fht16x16_sse2 tuples are the pre-rename lines; the
// vp9_fht16x16_sse2 tuples are their replacements.
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(
make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
make_tuple(&vp9_short_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
#endif
} // namespace
......@@ -45,7 +45,7 @@ void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
}
// Reference adapter for the 4x4 transform test: forwards in, out, stride and
// tx_type unchanged to the C implementation.
// Diff view: the vp9_short_fht4x4_c call is the pre-rename line and the
// vp9_fht4x4_c call is its replacement.
void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fht4x4_c(in, out, stride, tx_type);
vp9_fht4x4_c(in, out, stride, tx_type);
}
class Trans4x4TestBase {
......@@ -281,10 +281,10 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates the Trans4x4HT parameterized tests under the "C" prefix.
// Each tuple holds a forward-transform function pointer,
// &vp9_iht4x4_16_add_c, and an integer 0..3 (presumably the tx_type - confirm
// against the fixture).
// Diff view: the vp9_short_fht4x4_c tuples are the pre-rename lines; the
// vp9_fht4x4_c tuples are their replacements.
INSTANTIATE_TEST_CASE_P(
C, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
......@@ -295,10 +295,10 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates the Trans4x4HT parameterized tests under the "SSE2" prefix
// (this sits inside the #if HAVE_SSE2 region). Each tuple holds a
// forward-transform function pointer, &vp9_iht4x4_16_add_sse2, and an integer
// 0..3 (presumably the tx_type - confirm against the fixture).
// Diff view: the vp9_short_fht4x4_sse2 tuples are the pre-rename lines; the
// vp9_fht4x4_sse2 tuples are their replacements.
INSTANTIATE_TEST_CASE_P(
SSE2, Trans4x4HT,
::testing::Values(
make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
#endif
} // namespace
......@@ -44,7 +44,7 @@ void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
}
// Reference adapter for the 8x8 transform test: forwards in, out, stride and
// tx_type unchanged to the C implementation.
// Diff view: the vp9_short_fht8x8_c call is the pre-rename line and the
// vp9_fht8x8_c call is its replacement.
void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fht8x8_c(in, out, stride, tx_type);
vp9_fht8x8_c(in, out, stride, tx_type);
}
class FwdTrans8x8TestBase {
......@@ -308,10 +308,10 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates the FwdTrans8x8HT parameterized tests under the "C" prefix.
// Each tuple holds a forward-transform function pointer,
// &vp9_iht8x8_64_add_c, and an integer 0..3 (presumably the tx_type - confirm
// against the fixture).
// Diff view: the vp9_short_fht8x8_c tuples are the pre-rename lines; the
// vp9_fht8x8_c tuples are their replacements.
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
make_tuple(&vp9_short_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
......@@ -321,9 +321,9 @@ INSTANTIATE_TEST_CASE_P(
// Instantiates the FwdTrans8x8HT parameterized tests under the "SSE2" prefix
// (this sits inside the #if HAVE_SSE2 region). Each tuple holds a
// forward-transform function pointer, &vp9_iht8x8_64_add_sse2, and an integer
// 0..3 (presumably the tx_type - confirm against the fixture).
// Diff view: the vp9_short_fht8x8_sse2 tuples are the pre-rename lines; the
// vp9_fht8x8_sse2 tuples are their replacements.
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(
make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
make_tuple(&vp9_short_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
#endif
} // namespace
......@@ -707,14 +707,14 @@ if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
fi
# fdct functions
# Run-time dispatch entries for the 4x4, 8x8 and 16x16 fht functions. Each
# "prototype" line gives the return type, the base name and the argument list;
# the "specialize" line after it lists the sse2 and avx2 variants.
# Diff view: every vp9_short_fht* pair is the pre-rename entry and the
# vp9_fht* pair right after it is its replacement with the same signature.
prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht4x4 sse2 avx2
prototype void vp9_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_fht4x4 sse2 avx2
prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht8x8 sse2 avx2
prototype void vp9_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_fht8x8 sse2 avx2
prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_short_fht16x16 sse2 avx2
prototype void vp9_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
specialize vp9_fht16x16 sse2 avx2
prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
specialize vp9_fwht4x4
......
......@@ -18,8 +18,6 @@
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_dct.h"
static INLINE int fdct_round_shift(int input) {
int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
assert(INT16_MIN <= rv && rv <= INT16_MAX);
......@@ -157,32 +155,36 @@ static const transform_2d FHT_4[] = {
{ fadst4, fadst4 } // ADST_ADST = 3
};
// 4x4 forward transform selected by tx_type (C implementation).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht4x4_c) with its replacement (vp9_fht4x4_c). The column and row
// loops therefore appear twice with identical text; in the replacement they
// are nested inside the else branch.
//
// Behaviour of the replacement, as shown here:
//   - tx_type == DCT_DCT: delegates to vp9_fdct4x4_c(input, output, stride).
//   - otherwise: looks up the column/row 1-D transforms in FHT_4[tx_type] and
//     runs a column pass into the local 4x4 buffer `out`, then a row pass into
//     `output`.
// The pre-change function had no DCT_DCT shortcut and always used the table.
//
// input  - source samples, read as input[j * stride + i]
// output - destination coefficients, written as output[j + i * 4]
// stride - distance between vertically adjacent input samples
// tx_type - index into FHT_4
void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
int stride, int tx_type) {
int16_t out[4 * 4];
int16_t *outptr = &out[0];
int i, j;
int16_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
void vp9_fht4x4_c(const int16_t *input, int16_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct4x4_c(input, output, stride);
} else {
int16_t out[4 * 4];
int16_t *outptr = &out[0];
int i, j;
int16_t temp_in[4], temp_out[4];
const transform_2d ht = FHT_4[tx_type];
// Columns
// Each input column is scaled by 16; for the first column only, 1 is added
// to its first element when that element is non-zero. The result of ht.cols
// is stored back into column i of the intermediate buffer.
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = input[j * stride + i] * 16;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
outptr[j * 4 + i] = temp_out[j];
}
// Columns
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = input[j * stride + i] * 16;
if (i == 0 && temp_in[0])
temp_in[0] += 1;
ht.cols(temp_in, temp_out);
for (j = 0; j < 4; ++j)
outptr[j * 4 + i] = temp_out[j];
}
// Rows
// Each intermediate row goes through ht.rows; the result is written as
// (value + 1) >> 2.
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j + i * 4];
ht.rows(temp_in, temp_out);
for (j = 0; j < 4; ++j)
output[j + i * 4] = (temp_out[j] + 1) >> 2;
// Rows
for (i = 0; i < 4; ++i) {
for (j = 0; j < 4; ++j)
temp_in[j] = out[j + i * 4];
ht.rows(temp_in, temp_out);
for (j = 0; j < 4; ++j)
output[j + i * 4] = (temp_out[j] + 1) >> 2;
}
}
}
......@@ -565,30 +567,34 @@ static const transform_2d FHT_8[] = {
{ fadst8, fadst8 } // ADST_ADST = 3
};
// 8x8 forward transform selected by tx_type (C implementation).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht8x8_c: header, declarations and column loop first) with its
// replacement (vp9_fht8x8_c). The row loop likewise appears twice.
//
// Behaviour of the replacement, as shown here:
//   - tx_type == DCT_DCT: delegates to vp9_fdct8x8_c(input, output, stride).
//   - otherwise: looks up the column/row 1-D transforms in FHT_8[tx_type] and
//     runs a column pass into the local 64-entry buffer `out`, then a row pass
//     into `output`.
// The pre-change function had no DCT_DCT shortcut and always used the table.
//
// input  - source samples, read as input[j * stride + i]
// output - destination coefficients, written as output[j + i * 8]
// stride - distance between vertically adjacent input samples
// tx_type - index into FHT_8
void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
int stride, int tx_type) {
int16_t out[64];
int16_t *outptr = &out[0];
int i, j;
int16_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
// Columns
// Each input column is scaled by 4 before ht.cols; the result is stored back
// into column i of the intermediate buffer.
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
outptr[j * 8 + i] = temp_out[j];
}
void vp9_fht8x8_c(const int16_t *input, int16_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct8x8_c(input, output, stride);
} else {
int16_t out[64];
int16_t *outptr = &out[0];
int i, j;
int16_t temp_in[8], temp_out[8];
const transform_2d ht = FHT_8[tx_type];
// Columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
outptr[j * 8 + i] = temp_out[j];
}
// Rows
// Each intermediate row goes through ht.rows; negative results get 1 added
// before the right shift by one.
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j + i * 8];
ht.rows(temp_in, temp_out);
for (j = 0; j < 8; ++j)
output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
// Rows
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j + i * 8];
ht.rows(temp_in, temp_out);
for (j = 0; j < 8; ++j)
output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
}
}
}
......@@ -958,31 +964,34 @@ static const transform_2d FHT_16[] = {
{ fadst16, fadst16 } // ADST_ADST = 3
};
// 16x16 forward transform selected by tx_type (C implementation).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht16x16_c: header, declarations and column loop first) with its
// replacement (vp9_fht16x16_c). The row loop likewise appears twice. The
// commented-out "> 0" rounding variant only appears in the pre-change copy of
// the column loop.
//
// Behaviour of the replacement, as shown here:
//   - tx_type == DCT_DCT: delegates to vp9_fdct16x16_c(input, output, stride).
//   - otherwise: looks up the column/row 1-D transforms in FHT_16[tx_type] and
//     runs a column pass into the local 256-entry buffer `out`, then a row
//     pass into `output`.
// The pre-change function had no DCT_DCT shortcut and always used the table.
//
// input  - source samples, read as input[j * stride + i]
// output - destination coefficients, written as output[j + i * 16]
// stride - distance between vertically adjacent input samples
// tx_type - index into FHT_16
void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
int stride, int tx_type) {
int16_t out[256];
int16_t *outptr = &out[0];
int i, j;
int16_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
// Columns
// Each input column is scaled by 4 before ht.cols. The intermediate value is
// stored as (value + 1 + (value < 0)) >> 2, i.e. negative values get an
// extra 1 added before the shift.
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
// outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
void vp9_fht16x16_c(const int16_t *input, int16_t *output,
int stride, int tx_type) {
if (tx_type == DCT_DCT) {
vp9_fdct16x16_c(input, output, stride);
} else {
int16_t out[256];
int16_t *outptr = &out[0];
int i, j;
int16_t temp_in[16], temp_out[16];
const transform_2d ht = FHT_16[tx_type];
// Columns
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
// Rows
// The output of ht.rows is written to `output` without further scaling.
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j + i * 16];
ht.rows(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j + i * 16] = temp_out[j];
// Rows
for (i = 0; i < 16; ++i) {
for (j = 0; j < 16; ++j)
temp_in[j] = out[j + i * 16];
ht.rows(temp_in, temp_out);
for (j = 0; j < 16; ++j)
output[j + i * 16] = temp_out[j];
}
}
}
......@@ -1375,27 +1384,3 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
out[j + i * 32] = temp_out[j];
}
}
/* Selects the 4x4 forward transform by tx_type: DCT_DCT is handled by
 * vp9_fdct4x4, every other type is passed on to vp9_short_fht4x4. */
void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
                int stride) {
  if (tx_type != DCT_DCT) {
    vp9_short_fht4x4(input, output, stride, tx_type);
    return;
  }
  vp9_fdct4x4(input, output, stride);
}
/* Selects the 8x8 forward transform by tx_type: DCT_DCT is handled by
 * vp9_fdct8x8, every other type is passed on to vp9_short_fht8x8. */
void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
                int stride) {
  if (tx_type != DCT_DCT) {
    vp9_short_fht8x8(input, output, stride, tx_type);
    return;
  }
  vp9_fdct8x8(input, output, stride);
}
/* Selects the 16x16 forward transform by tx_type: DCT_DCT is handled by
 * vp9_fdct16x16, every other type is passed on to vp9_short_fht16x16. */
void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
                  int stride) {
  if (tx_type != DCT_DCT) {
    vp9_short_fht16x16(input, output, stride, tx_type);
    return;
  }
  vp9_fdct16x16(input, output, stride);
}
/*
* Copyright (c) 2013 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_ENCODER_VP9_DCT_H_
#define VP9_ENCODER_VP9_DCT_H_
#ifdef __cplusplus
extern "C" {
#endif
/* Forward-transform entry points that take tx_type as the FIRST argument.
 * Their definitions in this diff call vp9_fdctNxN when tx_type is DCT_DCT and
 * vp9_short_fhtNxN otherwise.
 * NOTE(review): the same commit switches callers to the
 * (input, output, stride, tx_type) argument order, so these declarations
 * appear to be removed along with this header - confirm. */
void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
int stride);
void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
int stride);
void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
int stride);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VP9_ENCODER_VP9_DCT_H_
......@@ -19,7 +19,6 @@
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_dct.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"
......@@ -571,7 +570,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
if (!x->skip_recode) {
vp9_subtract_block(16, 16, src_diff, diff_stride,
src, p->src.stride, dst, pd->dst.stride);
vp9_fht16x16(tx_type, src_diff, coeff, diff_stride);
vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
......@@ -591,7 +590,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
if (!x->skip_recode) {
vp9_subtract_block(8, 8, src_diff, diff_stride,
src, p->src.stride, dst, pd->dst.stride);
vp9_fht8x8(tx_type, src_diff, coeff, diff_stride);
vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff,
pd->dequant, p->zbin_extra, eob, scan_order->scan,
......@@ -617,7 +616,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
vp9_subtract_block(4, 4, src_diff, diff_stride,
src, p->src.stride, dst, pd->dst.stride);
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, diff_stride, tx_type);
vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
else
x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
......
......@@ -1064,7 +1064,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
so = &vp9_scan_orders[TX_4X4][tx_type];
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
vp9_fht4x4(src_diff, coeff, 8, tx_type);
else
x->fwd_txm4x4(src_diff, coeff, 8);
......
......@@ -244,32 +244,36 @@ void fadst4_avx2(__m128i *in) {
transpose_4x4_avx2(in);
}
// 4x4 forward transform selected by tx_type (AVX2 build).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht4x4_avx2) with its replacement (vp9_fht4x4_avx2). The load
// before the switch, the numeric "case N:" labels, the two fdct4_avx2 calls
// under "case 0" and the write after the switch appear to belong to the old
// version. The enum-labelled cases with their own load/write calls belong to
// the new one.
//
// Behaviour of the replacement, as shown here:
//   DCT_DCT   -> vp9_fdct4x4_avx2(input, output, stride)
//   ADST_DCT  -> load, fadst4_avx2, fdct4_avx2, write
//   DCT_ADST  -> load, fdct4_avx2, fadst4_avx2, write
//   ADST_ADST -> load, fadst4_avx2, fadst4_avx2, write
//   any other value hits assert(0).
void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[4];
load_buffer_4x4_avx2(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
fdct4_avx2(in);
fdct4_avx2(in);
case DCT_DCT:
vp9_fdct4x4_avx2(input, output, stride);
break;
case 1: // ADST_DCT
case ADST_DCT:
load_buffer_4x4_avx2(input, in, stride);
fadst4_avx2(in);
fdct4_avx2(in);
write_buffer_4x4_avx2(output, in);
break;
case 2: // DCT_ADST
case DCT_ADST:
load_buffer_4x4_avx2(input, in, stride);
fdct4_avx2(in);
fadst4_avx2(in);
write_buffer_4x4_avx2(output, in);
break;
case 3: // ADST_ADST
case ADST_ADST:
load_buffer_4x4_avx2(input, in, stride);
fadst4_avx2(in);
fadst4_avx2(in);
write_buffer_4x4_avx2(output, in);
break;
default:
assert(0);
break;
}
write_buffer_4x4_avx2(output, in);
}
void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
......@@ -1028,33 +1032,39 @@ void fadst8_avx2(__m128i *in) {
array_transpose_8x8_avx2(in, in);
}
// 8x8 forward transform selected by tx_type (AVX2 build).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht8x8_avx2) with its replacement (vp9_fht8x8_avx2). The load
// before the switch, the numeric "case N:" labels, the two fdct8_avx2 calls
// under "case 0" and the shift/write pair after the switch appear to belong
// to the old version. The enum-labelled cases with their own
// load/shift/write calls belong to the new one.
//
// Behaviour of the replacement, as shown here:
//   DCT_DCT   -> vp9_fdct8x8_avx2(input, output, stride)
//   ADST_DCT  -> load, fadst8_avx2, fdct8_avx2, right_shift_8x8_avx2(in, 1),
//                write
//   DCT_ADST  -> load, fdct8_avx2, fadst8_avx2, right shift, write
//   ADST_ADST -> load, fadst8_avx2, fadst8_avx2, right shift, write
//   any other value hits assert(0).
void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[8];
load_buffer_8x8_avx2(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
fdct8_avx2(in);
fdct8_avx2(in);
case DCT_DCT:
vp9_fdct8x8_avx2(input, output, stride);
break;
case 1: // ADST_DCT
case ADST_DCT:
load_buffer_8x8_avx2(input, in, stride);
fadst8_avx2(in);
fdct8_avx2(in);
right_shift_8x8_avx2(in, 1);
write_buffer_8x8_avx2(output, in, 8);
break;
case 2: // DCT_ADST
case DCT_ADST:
load_buffer_8x8_avx2(input, in, stride);
fdct8_avx2(in);
fadst8_avx2(in);
right_shift_8x8_avx2(in, 1);
write_buffer_8x8_avx2(output, in, 8);
break;
case 3: // ADST_ADST
case ADST_ADST:
load_buffer_8x8_avx2(input, in, stride);
fadst8_avx2(in);
fadst8_avx2(in);
right_shift_8x8_avx2(in, 1);
write_buffer_8x8_avx2(output, in, 8);
break;
default:
assert(0);
break;
}
right_shift_8x8_avx2(in, 1);
write_buffer_8x8_avx2(output, in, 8);
}
void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
......@@ -2534,36 +2544,39 @@ void fadst16_avx2(__m128i *in0, __m128i *in1) {
array_transpose_16x16_avx2(in0, in1);
}
// 16x16 forward transform selected by tx_type (AVX2 build).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht16x16_avx2) with its replacement (vp9_fht16x16_avx2). The load
// before the switch, the numeric "case N:" labels, the fdct16/shift/fdct16
// sequence under "case 0" and the write after the switch appear to belong to
// the old version. The enum-labelled cases with their own load/write calls
// belong to the new one.
//
// Behaviour of the replacement, as shown here (the block is held in two
// __m128i arrays, in0 and in1):
//   DCT_DCT   -> vp9_fdct16x16_avx2(input, output, stride)
//   ADST_DCT  -> load, fadst16_avx2, right_shift_16x16_avx2, fdct16_avx2,
//                write
//   DCT_ADST  -> load, fdct16_avx2, right shift, fadst16_avx2, write
//   ADST_ADST -> load, fadst16_avx2, right shift, fadst16_avx2, write
//   any other value hits assert(0).
void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_16x16_avx2(input, in0, in1, stride);
switch (tx_type) {
case 0: // DCT_DCT
fdct16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
fdct16_avx2(in0, in1);
case DCT_DCT:
vp9_fdct16x16_avx2(input, output, stride);
break;
case 1: // ADST_DCT
case ADST_DCT:
load_buffer_16x16_avx2(input, in0, in1, stride);
fadst16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
fdct16_avx2(in0, in1);
write_buffer_16x16_avx2(output, in0, in1, 16);
break;
case 2: // DCT_ADST
case DCT_ADST:
load_buffer_16x16_avx2(input, in0, in1, stride);
fdct16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
fadst16_avx2(in0, in1);
write_buffer_16x16_avx2(output, in0, in1, 16);
break;
case 3: // ADST_ADST
case ADST_ADST:
load_buffer_16x16_avx2(input, in0, in1, stride);
fadst16_avx2(in0, in1);
right_shift_16x16_avx2(in0, in1);
fadst16_avx2(in0, in1);
write_buffer_16x16_avx2(output, in0, in1, 16);
break;
default:
assert(0);
break;
}
write_buffer_16x16_avx2(output, in0, in1, 16);
}
#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
......
......@@ -242,32 +242,36 @@ void fadst4_sse2(__m128i *in) {
transpose_4x4(in);
}
// 4x4 forward transform selected by tx_type (SSE2 build).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht4x4_sse2) with its replacement (vp9_fht4x4_sse2). The load
// before the switch, the numeric "case N:" labels, the two fdct4_sse2 calls
// under "case 0" and the write after the switch appear to belong to the old
// version. The enum-labelled cases with their own load/write calls belong to
// the new one. The "default:" arm is listed twice because the old and new
// copies of those lines are both shown; it is not a duplicated label in the
// source file.
//
// Behaviour of the replacement, as shown here:
//   DCT_DCT   -> vp9_fdct4x4_sse2(input, output, stride)
//   ADST_DCT  -> load, fadst4_sse2, fdct4_sse2, write
//   DCT_ADST  -> load, fdct4_sse2, fadst4_sse2, write
//   ADST_ADST -> load, fadst4_sse2, fadst4_sse2, write
//   any other value hits assert(0).
void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[4];
load_buffer_4x4(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
fdct4_sse2(in);
fdct4_sse2(in);
case DCT_DCT:
vp9_fdct4x4_sse2(input, output, stride);
break;
case 1: // ADST_DCT
case ADST_DCT:
load_buffer_4x4(input, in, stride);
fadst4_sse2(in);
fdct4_sse2(in);
write_buffer_4x4(output, in);
break;
case 2: // DCT_ADST
case DCT_ADST:
load_buffer_4x4(input, in, stride);
fdct4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
case 3: // ADST_ADST
case ADST_ADST:
load_buffer_4x4(input, in, stride);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
default:
assert(0);
break;
default:
assert(0);
break;
}
write_buffer_4x4(output, in);
}
void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
......@@ -1026,33 +1030,39 @@ void fadst8_sse2(__m128i *in) {
array_transpose_8x8(in, in);
}
// 8x8 forward transform selected by tx_type (SSE2 build).
//
// Diff view: this hunk interleaves the pre-change function
// (vp9_short_fht8x8_sse2) with its replacement (vp9_fht8x8_sse2). The load
// before the switch, the numeric "case N:" labels, the two fdct8_sse2 calls
// under "case 0" and the shift/write pair after the switch appear to belong
// to the old version. The enum-labelled cases with their own
// load/shift/write calls belong to the new one.
//
// Behaviour of the replacement, as shown here:
//   DCT_DCT   -> vp9_fdct8x8_sse2(input, output, stride)
//   ADST_DCT  -> load, fadst8_sse2, fdct8_sse2, right_shift_8x8(in, 1), write
//   DCT_ADST  -> load, fdct8_sse2, fadst8_sse2, right shift, write
//   ADST_ADST -> load, fadst8_sse2, fadst8_sse2, right shift, write
//   any other value hits assert(0).
void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in[8];
load_buffer_8x8(input, in, stride);
switch (tx_type) {
case 0: // DCT_DCT
fdct8_sse2(in);
fdct8_sse2(in);
case DCT_DCT:
vp9_fdct8x8_sse2(input, output, stride);
break;
case 1: // ADST_DCT
case ADST_DCT:
load_buffer_8x8(input, in, stride);
fadst8_sse2(in);
fdct8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case 2: // DCT_ADST
case DCT_ADST:
load_buffer_8x8(input, in, stride);
fdct8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case 3: // ADST_ADST
case ADST_ADST:
load_buffer_8x8(input, in, stride);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
default:
assert(0);
break;
}
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
}
void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
......@@ -2532,36 +2542,39 @@ void fadst16_sse2(__m128i *in0, __m128i *in1) {
array_transpose_16x16(in0, in1);
}
void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
int stride, int tx_type) {
__m128i in0[16], in1[16];
load_buffer_16x16(input, in0, in1, stride);
switch (tx_type) {
case 0: // DCT_DCT
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
case DCT_DCT:
vp9_fdct16x16_sse2(input, output, stride);
break;
case 1: // ADST_DCT
case ADST_DCT:
load_buffer_16x16(input, in0, in1, stride);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fdct16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
case 2: // DCT_ADST
case DCT_ADST:
load_buffer_16x16(input, in0, in1, stride);
fdct16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;