diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index dab88a3b62b8ea6aa651cc699b4b18e3198b8a76..87628659b7706a7865a96d7ebde5001e1e08399c 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -139,6 +139,20 @@ specialize vp9_intra8x8_predict;
 prototype void vp9_intra_uv4x4_predict "struct macroblockd *xd, struct blockd *x, int b_mode, uint8_t *predictor"
 specialize vp9_intra_uv4x4_predict;
 
+if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
+prototype void vp9_add_residual_4x4 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_4x4 sse2
+
+prototype void vp9_add_residual_8x8 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_8x8 sse2
+
+prototype void vp9_add_residual_16x16 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_16x16 sse2
+
+prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride"
+specialize vp9_add_residual_32x32 sse2
+fi
+
 #
 # Loopfilter
 #
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c
index eaf98600929b10642b82ade175103445635aabd7..dade2aff56b5f7694ff78af7a0b1e46dc48c76b2 100644
--- a/vp9/decoder/vp9_dequantize.c
+++ b/vp9/decoder/vp9_dequantize.c
@@ -15,6 +15,7 @@
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/common/vp9_common.h"
 
+
 static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
                          uint8_t *dest, int stride, int width, int height) {
   int r, c;
@@ -29,6 +30,28 @@ static void add_residual(const int16_t *diff, const uint8_t *pred, int pitch,
   }
 }
 
+// Fixed-size wrappers around add_residual().  These are the plain C versions
+// of the vp9_add_residual_NxN prototypes declared in vp9_rtcd_defs.sh.
+void vp9_add_residual_4x4_c(const int16_t *diff, const uint8_t *pred, int pitch,
+                            uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 4, 4);
+}
+
+void vp9_add_residual_8x8_c(const int16_t *diff, const uint8_t *pred, int pitch,
+                            uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 8, 8);
+}
+
+void vp9_add_residual_16x16_c(const int16_t *diff, const uint8_t *pred,
+                              int pitch, uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 16, 16);
+}
+
+void vp9_add_residual_32x32_c(const int16_t *diff, const uint8_t *pred,
+                              int pitch, uint8_t *dest, int stride) {
+  add_residual(diff, pred, pitch, dest, stride, 32, 32);
+}
+
 static void add_constant_residual(const int16_t diff, const uint8_t *pred,
                                   int pitch, uint8_t *dest, int stride,
                                   int width, int height) {
@@ -55,7 +78,7 @@ void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input,
 
   vp9_short_iht4x4(input, output, 4, tx_type);
   vpx_memset(input, 0, 32);
-  add_residual(output, pred, pitch, dest, stride, 4, 4);
+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
 void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
@@ -76,7 +99,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input,
 
     vp9_short_iht8x8(input, output, 8, tx_type);
     vpx_memset(input, 0, 128);
-    add_residual(output, pred, pitch, dest, stride, 8, 8);
+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
   }
 }
 
@@ -94,7 +117,7 @@ void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
 
     vpx_memset(input, 0, 32);
 
-    add_residual(output, pred, pitch, dest, stride, 4, 4);
+    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
   } else {
     vp9_dc_only_idct_add(input[0]*dq[0], pred, dest, pitch, stride);
     ((int *)input)[0] = 0;
@@ -114,7 +137,7 @@ void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred,
   // the idct halves ( >> 1) the pitch
   vp9_short_idct4x4llm(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
-  add_residual(output, pred, pitch, dest, stride, 4, 4);
+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
 void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
@@ -131,7 +154,7 @@ void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq,
 
     vpx_memset(input, 0, 32);
 
-    add_residual(output, pred, pitch, dest, stride, 4, 4);
+    vp9_add_residual_4x4(output, pred, pitch, dest, stride);
   } else {
     vp9_dc_only_inv_walsh_add(input[0]*dq[0], pred, dest, pitch, stride);
     ((int *)input)[0] = 0;
@@ -152,7 +175,7 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq,
 
   vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1);
   vpx_memset(input, 0, 32);
-  add_residual(output, pred, pitch, dest, stride, 4, 4);
+  vp9_add_residual_4x4(output, pred, pitch, dest, stride);
 }
 
 void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
@@ -201,7 +224,7 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
     input[16] = input[17] = 0;
     input[24] = 0;
 
-    add_residual(output, pred, pitch, dest, stride, 8, 8);
+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
   } else {
     int i;
 
@@ -212,7 +235,7 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq,
     // the idct halves ( >> 1) the pitch
     vp9_short_idct8x8_c(input, output, 8 << 1);
     vpx_memset(input, 0, 128);
-    add_residual(output, pred, pitch, dest, stride, 8, 8);
+    vp9_add_residual_8x8(output, pred, pitch, dest, stride);
   }
 }
 
@@ -242,7 +265,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input,
 
     vpx_memset(input, 0, 512);
 
-    add_residual(output, pred, pitch, dest, stride, 16, 16);
+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
   }
 }
 
@@ -287,7 +310,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
     input[32] = input[33] = 0;
     input[48] = 0;
 
-    add_residual(output, pred, pitch, dest, stride, 16, 16);
+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
   } else {
     int i;
 
@@ -302,7 +325,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq,
 
     vpx_memset(input, 0, 512);
 
-    add_residual(output, pred, pitch, dest, stride, 16, 16);
+    vp9_add_residual_16x16(output, pred, pitch, dest, stride);
   }
 }
 
@@ -336,14 +359,14 @@ void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq,
       input[64] = input[65] = 0;
       input[96] = 0;
 
-      add_residual(output, pred, pitch, dest, stride, 32, 32);
+      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
     } else {
       int i;
       for (i = 1; i < 1024; i++)
         input[i] = input[i] * dq[1] / 2;
       vp9_short_idct32x32(input, output, 64);
       vpx_memset(input, 0, 2048);
-      add_residual(output, pred, pitch, dest, stride, 32, 32);
+      vp9_add_residual_32x32(output, pred, pitch, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/x86/vp9_dequantize_x86.c b/vp9/decoder/x86/vp9_dequantize_x86.c
new file mode 100644
index 0000000000000000000000000000000000000000..0001de4eeeaa029eab15d697864b5ec039b96c47
--- /dev/null
+++ b/vp9/decoder/x86/vp9_dequantize_x86.c
@@ -0,0 +1,257 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include <xmmintrin.h>  // SSE
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+
+// Adds a 4x4 block of 16-bit residuals (diff, rows packed 4 apart) to the
+// 8-bit prediction (rows `pitch` bytes apart) and writes the result, clamped
+// to [0, 255] by the unsigned-saturating pack, to dest (rows `stride` apart).
+void vp9_add_residual_4x4_sse2(const int16_t *diff, const uint8_t *pred,
+                               int pitch, uint8_t *dest, int stride) {
+  const int width = 4;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
+  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
+  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
+  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
+
+  // Prediction data.
+  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(pred + 0 * pitch));
+  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(pred + 1 * pitch));
+  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(pred + 2 * pitch));
+  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(pred + 3 * pitch));
+
+  p0 = _mm_unpacklo_epi8(p0, zero);
+  p1 = _mm_unpacklo_epi8(p1, zero);
+  p2 = _mm_unpacklo_epi8(p2, zero);
+  p3 = _mm_unpacklo_epi8(p3, zero);
+
+  p0 = _mm_add_epi16(p0, d0);
+  p1 = _mm_add_epi16(p1, d1);
+  p2 = _mm_add_epi16(p2, d2);
+  p3 = _mm_add_epi16(p3, d3);
+
+  p0 = _mm_packus_epi16(p0, p1);
+  p2 = _mm_packus_epi16(p2, p3);
+
+  *(int *)dest = _mm_cvtsi128_si32(p0);
+  dest += stride;
+
+  p0 = _mm_srli_si128(p0, 8);
+  *(int *)dest = _mm_cvtsi128_si32(p0);
+  dest += stride;
+
+  *(int *)dest = _mm_cvtsi128_si32(p2);
+  dest += stride;
+
+  p2 = _mm_srli_si128(p2, 8);
+  *(int *)dest = _mm_cvtsi128_si32(p2);
+}
+
+// 8x8 variant of the above.  diff is read with aligned 128-bit loads, so it
+// must be 16-byte aligned; pred and dest are accessed 8 bytes at a time with
+// no alignment requirement.
+void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred,
+                               int pitch, uint8_t *dest, int stride) {
+  const int width = 8;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
+  const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
+  const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
+  const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
+  const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
+  const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
+  const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
+  const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
+
+  // Prediction data.
+  __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch));
+  __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch));
+  __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch));
+  __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch));
+  __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch));
+  __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch));
+  __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch));
+  __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch));
+
+  p0 = _mm_unpacklo_epi8(p0, zero);
+  p1 = _mm_unpacklo_epi8(p1, zero);
+  p2 = _mm_unpacklo_epi8(p2, zero);
+  p3 = _mm_unpacklo_epi8(p3, zero);
+  p4 = _mm_unpacklo_epi8(p4, zero);
+  p5 = _mm_unpacklo_epi8(p5, zero);
+  p6 = _mm_unpacklo_epi8(p6, zero);
+  p7 = _mm_unpacklo_epi8(p7, zero);
+
+  p0 = _mm_add_epi16(p0, d0);
+  p1 = _mm_add_epi16(p1, d1);
+  p2 = _mm_add_epi16(p2, d2);
+  p3 = _mm_add_epi16(p3, d3);
+  p4 = _mm_add_epi16(p4, d4);
+  p5 = _mm_add_epi16(p5, d5);
+  p6 = _mm_add_epi16(p6, d6);
+  p7 = _mm_add_epi16(p7, d7);
+
+  p0 = _mm_packus_epi16(p0, p1);
+  p2 = _mm_packus_epi16(p2, p3);
+  p4 = _mm_packus_epi16(p4, p5);
+  p6 = _mm_packus_epi16(p6, p7);
+
+  // SSE: each packed register holds two rows (low 8 bytes, high 8 bytes).
+  _mm_storel_pi((__m64 *)(dest + 0 * stride), _mm_castsi128_ps(p0));
+  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(p0));
+
+  _mm_storel_pi((__m64 *)(dest + 2 * stride), _mm_castsi128_ps(p2));
+  _mm_storeh_pi((__m64 *)(dest + 3 * stride), _mm_castsi128_ps(p2));
+
+  _mm_storel_pi((__m64 *)(dest + 4 * stride), _mm_castsi128_ps(p4));
+  _mm_storeh_pi((__m64 *)(dest + 5 * stride), _mm_castsi128_ps(p4));
+
+  _mm_storel_pi((__m64 *)(dest + 6 * stride), _mm_castsi128_ps(p6));
+  _mm_storeh_pi((__m64 *)(dest + 7 * stride), _mm_castsi128_ps(p6));
+}
+
+// 16x16 variant, processed four rows per loop iteration.  diff, pred and dest
+// are all accessed with aligned 128-bit loads/stores, so each row of every
+// buffer must start on a 16-byte boundary.
+void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred,
+                                 int pitch, uint8_t *dest, int stride) {
+  const int width = 16;
+  int i = 4;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+
+  do {
+    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
+    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
+    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
+    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
+    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
+    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
+    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
+    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
+
+    // Prediction data.
+    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
+    p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
+    p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch));
+    p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch));
+
+    p0 = _mm_unpacklo_epi8(p1, zero);
+    p1 = _mm_unpackhi_epi8(p1, zero);
+    p2 = _mm_unpacklo_epi8(p3, zero);
+    p3 = _mm_unpackhi_epi8(p3, zero);
+    p4 = _mm_unpacklo_epi8(p5, zero);
+    p5 = _mm_unpackhi_epi8(p5, zero);
+    p6 = _mm_unpacklo_epi8(p7, zero);
+    p7 = _mm_unpackhi_epi8(p7, zero);
+
+    p0 = _mm_add_epi16(p0, d0);
+    p1 = _mm_add_epi16(p1, d1);
+    p2 = _mm_add_epi16(p2, d2);
+    p3 = _mm_add_epi16(p3, d3);
+    p4 = _mm_add_epi16(p4, d4);
+    p5 = _mm_add_epi16(p5, d5);
+    p6 = _mm_add_epi16(p6, d6);
+    p7 = _mm_add_epi16(p7, d7);
+
+    p0 = _mm_packus_epi16(p0, p1);
+    p1 = _mm_packus_epi16(p2, p3);
+    p2 = _mm_packus_epi16(p4, p5);
+    p3 = _mm_packus_epi16(p6, p7);
+
+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
+    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
+    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
+    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
+
+    diff += 4 * width;
+    pred += 4 * pitch;
+    dest += 4 * stride;
+  } while (--i);
+}
+
+// 32x32 variant, processed two rows per loop iteration.  The alignment
+// requirements match the 16x16 version.
+void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred,
+                                 int pitch, uint8_t *dest, int stride) {
+  const int width = 32;
+  int i = 16;
+  const __m128i zero = _mm_setzero_si128();
+
+  // Diff data
+  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+
+  do {
+    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
+    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
+    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
+    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
+    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
+    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
+    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
+    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
+
+    // Prediction data.
+    p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch));
+    p3 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16));
+    p5 = _mm_load_si128((const __m128i *)(pred + 1 * pitch));
+    p7 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16));
+
+    p0 = _mm_unpacklo_epi8(p1, zero);
+    p1 = _mm_unpackhi_epi8(p1, zero);
+    p2 = _mm_unpacklo_epi8(p3, zero);
+    p3 = _mm_unpackhi_epi8(p3, zero);
+    p4 = _mm_unpacklo_epi8(p5, zero);
+    p5 = _mm_unpackhi_epi8(p5, zero);
+    p6 = _mm_unpacklo_epi8(p7, zero);
+    p7 = _mm_unpackhi_epi8(p7, zero);
+
+    p0 = _mm_add_epi16(p0, d0);
+    p1 = _mm_add_epi16(p1, d1);
+    p2 = _mm_add_epi16(p2, d2);
+    p3 = _mm_add_epi16(p3, d3);
+    p4 = _mm_add_epi16(p4, d4);
+    p5 = _mm_add_epi16(p5, d5);
+    p6 = _mm_add_epi16(p6, d6);
+    p7 = _mm_add_epi16(p7, d7);
+
+    p0 = _mm_packus_epi16(p0, p1);
+    p1 = _mm_packus_epi16(p2, p3);
+    p2 = _mm_packus_epi16(p4, p5);
+    p3 = _mm_packus_epi16(p6, p7);
+
+    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
+    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
+    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
+    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
+
+    diff += 2 * width;
+    pred += 2 * pitch;
+    dest += 2 * stride;
+  } while (--i);
+}
+#endif
diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 7622fc0b23ca49a386233d67142b32a7d08be0bd..239ae30b693f179d1f8d50a6ccce7dd53087fe49 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk
@@ -38,5 +38,11 @@ VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes))
 
 VP9_DX_SRCS-$(HAVE_SSE2) += decoder/x86/vp9_idct_blk_sse2.c
 
+VP9_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/vp9_dequantize_x86.c
+ifeq ($(HAVE_SSE2),yes)
+vp9/decoder/x86/vp9_dequantize_x86.c.o: CFLAGS += -msse2
+vp9/decoder/x86/vp9_dequantize_x86.c.d: CFLAGS += -msse2
+endif
+
 $(eval $(call asm_offsets_template,\
          vp9_asm_dec_offsets.asm, $(VP9_PREFIX)decoder/vp9_asm_dec_offsets.c))