Commit 641fda79 authored by Johann's avatar Johann

highbd x86: consolidate tran_low_t conversions

Create new helper files specifically for converting tran_low_t types.

Change-Id: I7c4c458ef910f3b3d10a3cfbf9df4de7682fd905
parent a16ca80b
......@@ -149,6 +149,7 @@ CODEC_SRCS-yes += $(BUILD_PFX)vpx_config.c
INSTALL-SRCS-no += $(BUILD_PFX)vpx_config.c
ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += third_party/x86inc/x86inc.asm
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm
endif
CODEC_EXPORTS-yes += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
......@@ -204,6 +205,7 @@ ASM_INCLUDES := \
third_party/x86inc/x86inc.asm \
vpx_config.asm \
vpx_ports/x86_abi_support.asm \
vpx_dsp/x86/bitdepth_conversion_sse2.asm \
vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def
@echo " [CREATE] $@"
......
......@@ -11,6 +11,7 @@
%define private_prefix vp9
%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
......@@ -62,25 +63,7 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride
psllw m0, 2
psllw m1, 2
%if CONFIG_VP9_HIGHBITDEPTH
; sign extension
mova m2, m0
mova m3, m1
punpcklwd m0, m0
punpcklwd m1, m1
punpckhwd m2, m2
punpckhwd m3, m3
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
mova [outputq], m0
mova [outputq + 16], m2
mova [outputq + 32], m1
mova [outputq + 48], m3
%else
mova [outputq], m0
mova [outputq + 16], m1
%endif
STORE_TRAN_LOW 0, outputq, 0, 2, 3
STORE_TRAN_LOW 1, outputq, 1, 2, 3
RET
......@@ -14,7 +14,7 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/fdct.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
......
......@@ -13,6 +13,12 @@ DSP_SRCS-yes += vpx_dsp_common.h
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h
# This file is included in libs.mk. Listing it here as well would cause it to
# be compiled into an object. Even if that object were empty, linking it would
# cause the stack to be marked executable.
#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM)
# bit reader
DSP_SRCS-yes += prob.h
DSP_SRCS-yes += prob.c
......@@ -245,7 +251,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
......
......@@ -12,7 +12,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/fdct.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
......
......@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
......@@ -94,20 +95,6 @@ SECTION .text
SWAP 7, 9
%endmacro
%if CONFIG_VP9_HIGHBITDEPTH
; STORE_TRAN_LOW src, offset
; Sign-extend the 16-bit words of m%1 to 32 bits and store them as two
; vectors: the low half at outputq + %2 and the high half at outputq + %2 + 16.
; %2 is used directly as a byte offset from outputq.
; This macro is only defined when CONFIG_VP9_HIGHBITDEPTH is set.
; uses m8-m10 as scratch registers; m%1 is only read, so it is preserved as
; long as %1 is not one of 8, 9 or 10.
%macro STORE_TRAN_LOW 2
; Build the per-word sign mask in m8: start from zero, then pcmpgtw sets each
; word to all ones where 0 > m%1 (i.e. where the source word is negative).
pxor m8, m8
mova m9, m%1
mova m10, m%1
pcmpgtw m8, m%1
; Interleave each source word with its sign mask word to form 32-bit values:
; m9 receives the low half, m10 the high half.
punpcklwd m9, m8
punpckhwd m10, m8
mova [outputq + %2], m9
mova [outputq + %2 + 16], m10
%endmacro
%endif
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 11, input, stride, output
......@@ -130,25 +117,14 @@ cglobal hadamard_8x8, 3, 5, 11, input, stride, output
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
HMD8_1D
%if CONFIG_VP9_HIGHBITDEPTH
STORE_TRAN_LOW 0, 0
STORE_TRAN_LOW 1, 32
STORE_TRAN_LOW 2, 64
STORE_TRAN_LOW 3, 96
STORE_TRAN_LOW 4, 128
STORE_TRAN_LOW 5, 160
STORE_TRAN_LOW 6, 192
STORE_TRAN_LOW 7, 224
%else
mova [outputq + 0], m0
mova [outputq + 16], m1
mova [outputq + 32], m2
mova [outputq + 48], m3
mova [outputq + 64], m4
mova [outputq + 80], m5
mova [outputq + 96], m6
mova [outputq + 112], m7
%endif
STORE_TRAN_LOW 0, outputq, 0, 8, 9
STORE_TRAN_LOW 1, outputq, 1, 8, 9
STORE_TRAN_LOW 2, outputq, 2, 8, 9
STORE_TRAN_LOW 3, outputq, 3, 8, 9
STORE_TRAN_LOW 4, outputq, 4, 8, 9
STORE_TRAN_LOW 5, outputq, 5, 8, 9
STORE_TRAN_LOW 6, outputq, 6, 8, 9
STORE_TRAN_LOW 7, outputq, 7, 8, 9
RET
%endif
;
; Copyright (c) 2017 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
; vpx_config.asm is not guarded so can not be included twice. Because this will
; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
; included after those files.
; Advance the pointer in %1 past eight tran_low_t coefficients, i.e. by
; 8 * sizeof(tran_low_t) bytes: 8 * 4 when tran_low_t is 32 bits (high bit
; depth configuration) and 8 * 2 when it is 16 bits (low bit depth).
; %1 - register to advance. Uses add, so the flags are modified.
%macro INCREMENT_TRAN_LOW 1
%if CONFIG_VP9_HIGHBITDEPTH
  add %1, 8 * 4
%else
  add %1, 8 * 2
%endif
%endmacro
; Advance the pointer in %1 by %2 tran_low_t elements, i.e. by
; %2 * sizeof(tran_low_t) bytes (4 bytes per element in the high bit depth
; configuration, 2 bytes per element otherwise).
; %1 - register to advance; it is both the base and the destination of the lea.
; %2 - number of elements to skip. The argument is parenthesized before being
;      scaled so that an expression such as "8 + 8" is multiplied as a whole
;      instead of only its last term.
; Uses lea, so the flags are left untouched.
%macro INCREMENT_ELEMENTS_TRAN_LOW 2
%if CONFIG_VP9_HIGHBITDEPTH
  lea %1, [%1 + (%2) * 4]
%else
  lea %1, [%1 + (%2) * 2]
%endif
%endmacro
; Load one register's worth of coefficients from %2 into m%1 as 16-bit values.
; %1 - number of the destination register (m%1).
; %2 - base address.
; %3 - offset from %2 counted in whole register loads (rows of eight
;      coefficients), NOT in bytes or single elements: the byte offset used is
;      %3 * 32 in the high bit depth configuration and %3 * 16 otherwise.
;      The argument is parenthesized before scaling so that an expression such
;      as "2 + 1" is multiplied as a whole.
; If tran_low_t is 16 bits (low bit depth configuration) then load the values
; directly. If tran_low_t is 32 bits (high bit depth configuration) then two
; consecutive 16-byte vectors are read and packed down to 16 bits with signed
; saturation (packssdw).
; NOTE(review): the "+ 16" second-half offset assumes 16-byte (XMM) registers;
; confirm before using this macro under a wider register setup.
%macro LOAD_TRAN_LOW 3
%if CONFIG_VP9_HIGHBITDEPTH
  mova     m%1, [%2 + (%3) * 32]
  packssdw m%1, [%2 + (%3) * 32 + 16]
%else
  mova     m%1, [%2 + (%3) * 16]
%endif
%endmacro
; Store the 16-bit coefficients held in m%1 to %2 as tran_low_t values.
; %1 - number of the source register (m%1). In the high bit depth
;      configuration m%1 is CLOBBERED: on exit it holds the sign-extended
;      upper half of the original values. It is preserved otherwise.
; %2 - base address.
; %3 - offset from %2 counted in whole register stores (rows of eight
;      coefficients), NOT in bytes or single elements: the byte offset used is
;      %3 * 32 in the high bit depth configuration and %3 * 16 otherwise.
;      The argument is parenthesized before scaling so that an expression such
;      as "2 + 1" is multiplied as a whole.
; %4 - number of a scratch register; receives the per-word sign mask.
; %5 - number of a scratch register; receives the sign-extended lower half.
; If tran_low_t is 16 bits (low bit depth configuration) then store the values
; directly; %4 and %5 are not touched. If tran_low_t is 32 bits (high bit
; depth configuration) then sign extend the values first and store them as
; two consecutive 16-byte vectors.
%macro STORE_TRAN_LOW 5
%if CONFIG_VP9_HIGHBITDEPTH
  ; m%4 = 0, then all ones in every word where 0 > m%1 (negative source word).
  pxor      m%4, m%4
  mova      m%5, m%1
  pcmpgtw   m%4, m%1
  ; Interleave each source word with its sign mask to form 32-bit values.
  punpcklwd m%5, m%4
  punpckhwd m%1, m%4
  mova      [%2 + (%3) * 32 +  0], m%5
  mova      [%2 + (%3) * 32 + 16], m%1
%else
  mova      [%2 + (%3) * 16], m%1
%endif
%endmacro
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
* Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
......
......@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION_RODATA
......@@ -230,21 +231,10 @@ cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
lea r3, [2 * strideq]
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
mova m2, [inputq + 64]
packssdw m2, [inputq + 80]
mova m3, [inputq + 96]
packssdw m3, [inputq + 112]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
mova m2, [inputq + 32]
mova m3, [inputq + 48]
%endif
LOAD_TRAN_LOW 0, inputq, 0
LOAD_TRAN_LOW 1, inputq, 1
LOAD_TRAN_LOW 2, inputq, 2
LOAD_TRAN_LOW 3, inputq, 3
punpcklwd m0, m1
punpcklwd m2, m3
......@@ -752,33 +742,14 @@ idct32x32_34:
lea r4, [rsp + transposed_in]
idct32x32_34_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
LOAD_TRAN_LOW 0, r3, 0
LOAD_TRAN_LOW 1, r3, 4
LOAD_TRAN_LOW 2, r3, 8
LOAD_TRAN_LOW 3, r3, 12
LOAD_TRAN_LOW 4, r3, 16
LOAD_TRAN_LOW 5, r3, 20
LOAD_TRAN_LOW 6, r3, 24
LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
......@@ -1182,33 +1153,15 @@ idct32x32_135:
mov r7, 2
idct32x32_135_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
LOAD_TRAN_LOW 0, r3, 0
LOAD_TRAN_LOW 1, r3, 4
LOAD_TRAN_LOW 2, r3, 8
LOAD_TRAN_LOW 3, r3, 12
LOAD_TRAN_LOW 4, r3, 16
LOAD_TRAN_LOW 5, r3, 20
LOAD_TRAN_LOW 6, r3, 24
LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
mova [r4 + 0], m0
......@@ -1220,22 +1173,14 @@ idct32x32_135_transpose:
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
%if CONFIG_VP9_HIGHBITDEPTH
add r3, 32
%else
add r3, 16
%endif
INCREMENT_TRAN_LOW r3
add r4, 16 * 8
dec r7
jne idct32x32_135_transpose
IDCT32X32_135 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
%if CONFIG_VP9_HIGHBITDEPTH
lea inputq, [inputq + 32 * 32]
%else
lea inputq, [inputq + 16 * 32]
%endif
INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
dec r6
jnz idct32x32_135
......@@ -1646,33 +1591,14 @@ idct32x32_1024:
mov r7, 4
idct32x32_1024_transpose:
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [r3 + 0]
packssdw m0, [r3 + 16]
mova m1, [r3 + 32 * 4]
packssdw m1, [r3 + 32 * 4 + 16]
mova m2, [r3 + 32 * 8]
packssdw m2, [r3 + 32 * 8 + 16]
mova m3, [r3 + 32 * 12]
packssdw m3, [r3 + 32 * 12 + 16]
mova m4, [r3 + 32 * 16]
packssdw m4, [r3 + 32 * 16 + 16]
mova m5, [r3 + 32 * 20]
packssdw m5, [r3 + 32 * 20 + 16]
mova m6, [r3 + 32 * 24]
packssdw m6, [r3 + 32 * 24 + 16]
mova m7, [r3 + 32 * 28]
packssdw m7, [r3 + 32 * 28 + 16]
%else
mova m0, [r3 + 0]
mova m1, [r3 + 16 * 4]
mova m2, [r3 + 16 * 8]
mova m3, [r3 + 16 * 12]
mova m4, [r3 + 16 * 16]
mova m5, [r3 + 16 * 20]
mova m6, [r3 + 16 * 24]
mova m7, [r3 + 16 * 28]
%endif
LOAD_TRAN_LOW 0, r3, 0
LOAD_TRAN_LOW 1, r3, 4
LOAD_TRAN_LOW 2, r3, 8
LOAD_TRAN_LOW 3, r3, 12
LOAD_TRAN_LOW 4, r3, 16
LOAD_TRAN_LOW 5, r3, 20
LOAD_TRAN_LOW 6, r3, 24
LOAD_TRAN_LOW 7, r3, 28
TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
......@@ -1684,11 +1610,7 @@ idct32x32_1024_transpose:
mova [r4 + 16 * 5], m5
mova [r4 + 16 * 6], m6
mova [r4 + 16 * 7], m7
%if CONFIG_VP9_HIGHBITDEPTH
add r3, 32
%else
add r3, 16
%endif
INCREMENT_TRAN_LOW r3
add r4, 16 * 8
dec r7
jne idct32x32_1024_transpose
......@@ -1696,11 +1618,7 @@ idct32x32_1024_transpose:
IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
lea stp, [stp + 16 * 8]
%if CONFIG_VP9_HIGHBITDEPTH
lea inputq, [inputq + 32 * 32]
%else
lea inputq, [inputq + 16 * 32]
%endif
INCREMENT_ELEMENTS_TRAN_LOW inputq, 8*32
dec r6
jnz idct32x32_1024
......
......@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
......@@ -82,15 +83,8 @@ SECTION .text
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
%if CONFIG_VP9_HIGHBITDEPTH
mova m0, [inputq + 0]
packssdw m0, [inputq + 16]
mova m1, [inputq + 32]
packssdw m1, [inputq + 48]
%else
mova m0, [inputq + 0]
mova m1, [inputq + 16]
%endif
LOAD_TRAN_LOW 0, inputq, 0
LOAD_TRAN_LOW 1, inputq, 1
psraw m0, 2
psraw m1, 2
......
......@@ -13,7 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/fdct.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment