Commit d5de63d2 authored by Linfeng Zhang

Update highbd idct function arguments to use uint16_t dst

BUG=webm:1388

Change-Id: I3581d80d0389b99166e70987d38aba2db6c469d5
parent 081b39f2
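For context on the pointer convention this commit changes: the high-bit-depth inverse-transform functions previously took a uint8_t *dest and cast it to uint16_t internally; after this change the cast happens once at each call site and the functions take uint16_t *dest directly. A minimal caller sketch follows, assuming the CONVERT_TO_SHORTPTR() macro from vpx_dsp/vpx_dsp_common.h (which recovers the uint16_t view of a high-bit-depth buffer tracked through a uint8_t pointer) and the updated vp9_highbd_idct4x4_add() prototype from this patch; the wrapper name add_idct4x4_highbd is hypothetical.

/* Hypothetical caller sketch, not part of this patch. Assumes
 * CONVERT_TO_SHORTPTR() from vpx_dsp/vpx_dsp_common.h and the updated
 * vp9_highbd_idct4x4_add() signature introduced by this change. */
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vp9/common/vp9_idct.h"

#if CONFIG_VP9_HIGHBITDEPTH
static void add_idct4x4_highbd(const tran_low_t *dqcoeff, uint8_t *dst,
                               int stride, int eob, int bd) {
  /* The codec tracks high-bit-depth buffers through uint8_t pointers;
   * convert once here, then pass uint16_t *dest to the idct directly. */
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
  vp9_highbd_idct4x4_add(dqcoeff, dst16, stride, eob, bd);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH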
@@ -255,11 +255,11 @@ void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
#if CONFIG_VP9_HIGHBITDEPTH
void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_256_add_c(in, out, stride, 10);
vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_256_add_c(in, out, stride, 12);
vpx_highbd_idct16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
@@ -273,36 +273,36 @@ void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
}
void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 10);
vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
}
void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht16x16_256_add_c(in, out, stride, tx_type, 12);
vp9_highbd_iht16x16_256_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
}
#if HAVE_SSE2
void idct16x16_10_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_10_add_c(in, out, stride, 10);
vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_10_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_10_add_c(in, out, stride, 12);
vpx_highbd_idct16x16_10_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct16x16_256_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 10);
vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_256_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_256_add_sse2(in, out, stride, 12);
vpx_highbd_idct16x16_256_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct16x16_10_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 10);
vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct16x16_10_add_sse2(in, out, stride, 12);
vpx_highbd_idct16x16_10_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -71,11 +71,11 @@ typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
#if CONFIG_VP9_HIGHBITDEPTH
void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct32x32_1024_add_c(in, out, stride, 10);
vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct32x32_1024_add_c(in, out, stride, 12);
vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -55,36 +55,36 @@ void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
#if CONFIG_VP9_HIGHBITDEPTH
void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct4x4_16_add_c(in, out, stride, 10);
vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct4x4_16_add_c(in, out, stride, 12);
vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 10);
vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
}
void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht4x4_16_add_c(in, out, stride, tx_type, 12);
vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
}
void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_iwht4x4_16_add_c(in, out, stride, 10);
vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_iwht4x4_16_add_c(in, out, stride, 12);
vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#if HAVE_SSE2
void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 10);
vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct4x4_16_add_sse2(in, out, stride, 12);
vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -88,45 +88,45 @@ void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
#if CONFIG_VP9_HIGHBITDEPTH
void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_64_add_c(in, out, stride, 10);
vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_64_add_c(in, out, stride, 12);
vpx_highbd_idct8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 10);
vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10);
}
void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
vp9_highbd_iht8x8_64_add_c(in, out, stride, tx_type, 12);
vp9_highbd_iht8x8_64_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12);
}
#if HAVE_SSE2
void idct8x8_12_add_10_c(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_12_add_c(in, out, stride, 10);
vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_12_add_12_c(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_12_add_c(in, out, stride, 12);
vpx_highbd_idct8x8_12_add_c(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct8x8_12_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 10);
vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_12_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_12_add_sse2(in, out, stride, 12);
vpx_highbd_idct8x8_12_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
void idct8x8_64_add_10_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 10);
vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10);
}
void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
vpx_highbd_idct8x8_64_add_sse2(in, out, stride, 12);
vpx_highbd_idct8x8_64_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12);
}
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -43,9 +43,11 @@ void wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
}
#if CONFIG_VP9_HIGHBITDEPTH
template <InvTxfmWithBdFunc fn>
typedef void (*InvTxfmHighbdFunc)(const tran_low_t *in, uint16_t *out,
int stride, int bd);
template <InvTxfmHighbdFunc fn>
void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) {
fn(in, out, stride, bd);
fn(in, CAST_TO_SHORTPTR(out), stride, bd);
}
#endif
@@ -205,7 +205,7 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
const highbd_transform_2d IHT_4[] = {
{ vpx_highbd_idct4_c, vpx_highbd_idct4_c }, // DCT_DCT = 0
@@ -213,7 +213,6 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
{ vpx_highbd_idct4_c, vpx_highbd_iadst4_c }, // DCT_ADST = 2
{ vpx_highbd_iadst4_c, vpx_highbd_iadst4_c } // ADST_ADST = 3
};
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int i, j;
tran_low_t out[4 * 4];
@@ -245,14 +244,13 @@ static const highbd_transform_2d HIGH_IHT_8[] = {
{ vpx_highbd_iadst8_c, vpx_highbd_iadst8_c } // ADST_ADST = 3
};
void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[8 * 8];
tran_low_t *outptr = out;
tran_low_t temp_in[8], temp_out[8];
const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Inverse transform row vectors.
for (i = 0; i < 8; ++i) {
@@ -279,14 +277,13 @@ static const highbd_transform_2d HIGH_IHT_16[] = {
{ vpx_highbd_iadst16_c, vpx_highbd_iadst16_c } // ADST_ADST = 3
};
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
int stride, int tx_type, int bd) {
int i, j;
tran_low_t out[16 * 16];
tran_low_t *outptr = out;
tran_low_t temp_in[16], temp_out[16];
const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
// Rows
for (i = 0; i < 16; ++i) {
@@ -307,7 +304,7 @@ void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
}
// idct
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
if (eob > 1)
vpx_highbd_idct4x4_16_add(input, dest, stride, bd);
@@ -315,7 +312,7 @@ void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
vpx_highbd_idct4x4_1_add(input, dest, stride, bd);
}
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
if (eob > 1)
vpx_highbd_iwht4x4_16_add(input, dest, stride, bd);
@@ -323,7 +320,7 @@ void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
vpx_highbd_iwht4x4_1_add(input, dest, stride, bd);
}
void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd) {
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -340,7 +337,7 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
}
}
void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd) {
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to separate different cases.
@@ -356,7 +353,7 @@ void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
}
}
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd) {
// Non-zero coeff only in upper-left 8x8
if (eob == 1) {
@@ -372,7 +369,7 @@ void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
// iht
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd) {
uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT)
vp9_highbd_idct4x4_add(input, dest, stride, eob, bd);
else
@@ -380,7 +377,7 @@ void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
}
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd) {
uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT) {
vp9_highbd_idct8x8_add(input, dest, stride, eob, bd);
} else {
@@ -389,7 +386,7 @@ void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
}
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd) {
uint16_t *dest, int stride, int eob, int bd) {
if (tx_type == DCT_DCT) {
vp9_highbd_idct16x16_add(input, dest, stride, eob, bd);
} else {
@@ -57,22 +57,22 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_highbd_iwht4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_highbd_idct4x4_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_highbd_idct8x8_add(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
void vp9_highbd_idct16x16_add(const tran_low_t *input, uint8_t *dest,
void vp9_highbd_idct16x16_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd);
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint8_t *dest,
void vp9_highbd_idct32x32_add(const tran_low_t *input, uint16_t *dest,
int stride, int eob, int bd);
void vp9_highbd_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
uint16_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
uint16_t *dest, int stride, int eob, int bd);
void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
uint8_t *dest, int stride, int eob, int bd);
uint16_t *dest, int stride, int eob, int bd);
#endif // CONFIG_VP9_HIGHBITDEPTH
#ifdef __cplusplus
} // extern "C"
@@ -101,11 +101,11 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
}
#
@@ -189,7 +189,7 @@ static void inverse_transform_block_inter(MACROBLOCKD *xd, int plane,
assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
} else {
@@ -257,7 +257,7 @@ static void inverse_transform_block_intra(MACROBLOCKD *xd, int plane,
assert(eob > 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, dst16, stride, eob, xd->bd);
} else {
@@ -184,7 +184,7 @@ struct macroblock {
void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
#if CONFIG_VP9_HIGHBITDEPTH
void (*highbd_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride,
int eob, int bd);
#endif
};
@@ -637,7 +637,7 @@ static void encode_block(int plane, int block, int row, int col,
if (x->skip_encode || p->eobs[block] == 0) return;
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
vp9_highbd_idct32x32_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
@@ -700,8 +700,8 @@ static void encode_block_pass1(int plane, int block, int row, int col,
if (p->eobs[block] > 0) {
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
x->highbd_itxm_add(dqcoeff, CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst)),
pd->dst.stride, p->eobs[block], xd->bd);
x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride,
p->eobs[block], xd->bd);
return;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -801,7 +801,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
switch (tx_size) {
case TX_32X32:
if (!x->skip_recode) {
@@ -601,22 +601,21 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
32, NULL, 0, NULL, 0, bs, bs, xd->bd);
recon = CAST_TO_BYTEPTR(recon16);
if (xd->lossless) {
vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd);
vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_8X8:
vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd);
vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_16X16:
vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd);
vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
case TX_32X32:
vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd);
vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);
break;
default: assert(0 && "Invalid transform size");
}
@@ -1005,7 +1004,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int block = (row + idy) * 2 + (col + idx);
const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
uint8_t *const dst16 = CAST_TO_BYTEPTR(CONVERT_TO_SHORTPTR(dst));
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
@@ -1268,10 +1268,8 @@ void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
}
}
void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t row_idct_output[16 * 16];
@@ -1313,10 +1311,8 @@ void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
}
}
void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t row_idct_output[16 * 16];
@@ -1349,10 +1345,8 @@ void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
}
}
void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t row_idct_output[4 * 16];
@@ -1414,7 +1408,7 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
*dest += stride;
}
void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -1422,7 +1416,6 @@ void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int i;
if (a1 >= 0) {
@@ -386,15 +386,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int32_t *const out,
}
static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
uint8_t *const dest,
const int stride, const int bd) {
uint16_t *dst, const int stride,
const int bd) {
int i, idct32_pass_loop;
int32_t trans_buf[32 * 8];
int32_t pass1[32 * 32];
int32_t pass2[32 * 32];
int32_t *out;
int32x4x2_t q[16];
uint16_t *dst = CAST_TO_SHORTPTR(dest);
for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2;
idct32_pass_loop++, input = pass1, out = pass2) {
@@ -637,10 +636,10 @@ static INLINE void vpx_highbd_idct32_32_neon(const tran_low_t *input,
}
}
void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
if (bd == 8) {
vpx_idct32_32_neon(input, dest, stride, 1);
vpx_idct32_32_neon(input, CAST_TO_BYTEPTR(dest), stride, 1);
} else {
vpx_highbd_idct32_32_neon(input, dest, stride, bd);
}
@@ -726,10 +726,9 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input,
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 16];
@@ -594,10 +594,9 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output,
highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd);
}
void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
int i;
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
if (bd == 8) {
int16_t temp[32 * 8];
@@ -59,7 +59,7 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
*dest += stride;
}
void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
@@ -67,7 +67,6 @@ void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
const int16x8_t dc = vdupq_n_s16(a1);
uint16_t *dest = CAST_TO_SHORTPTR(dest8);
int i;
if (a1 >= 0) {
@@ -51,7 +51,7 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
*dest += stride;
}
void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8,
void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
const int16x8_t max = vdupq_n_s16((1 << bd) - 1);