Commit 7a07eea1 authored by John Koleszar's avatar John Koleszar

Convert subpixel filters to use convolve framework

Update the code to call the new convolution functions to do subpixel
prediction rather than the existing functions. Remove the old C and
assembly code, since it is unused. This causes a 50% performance
reduction on the decoder, but that will be resolved when the asm for
the new functions is available.

There is no consensus for whether 6-tap or 2-tap predictors will be
supported in the final codec, so these filters are implemented in
terms of the 8-tap code, so that quality testing of these modes
can continue. Implementing the lower complexity algorithms is a
simple exercise, should it be necessary.

This code produces slightly better results in the EIGHTTAP_SMOOTH
case, since the filter is now applied in only one direction when
the subpel motion is only in one direction. Like the previous code,
the filtering is skipped entirely on full-pel MVs. This combination
seems to give the best quality gains, but this may be indicative of a
bug in the encoder's filter selection, since the encoder could
achieve the result of skipping the filtering on full-pel by selecting
one of the other filters. This should be revisited.

Quality gains on derf positive on almost all clips. The only clip
that seemed to be hurt at all datarates was football
(-0.115% PSNR average, -0.587% min). Overall averages 0.375% PSNR,
0.347% SSIM.

Change-Id: I7d469716091b1d89b4b08adde5863999319d69ff
parent 5ca6a366
......@@ -11,8 +11,6 @@
#include "./vpx_config.h"
#include "vp9_rtcd.h"
#include "vp9/common/vp9_subpixel.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
void vp9_machine_specific_config(VP9_COMMON *ctx) {
......
......@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp9/common/vp9_subpixel.h"
#include "vp9/common/vp9_loopfilter.h"
#include "recon.h"
#include "vp9/common/vp9_onyxc_int.h"
......
......@@ -16,9 +16,9 @@ void vpx_log(const char *format, ...);
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_treecoder.h"
#include "vp9/common/vp9_subpixel.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
......@@ -393,15 +393,8 @@ typedef struct macroblockd {
void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
struct subpix_fn_table subpix;
vp9_subpix_fn_t subpixel_predict4x4;
vp9_subpix_fn_t subpixel_predict8x4;
vp9_subpix_fn_t subpixel_predict8x8;
vp9_subpix_fn_t subpixel_predict16x16;
vp9_subpix_fn_t subpixel_predict_avg4x4;
vp9_subpix_fn_t subpixel_predict_avg8x4;
vp9_subpix_fn_t subpixel_predict_avg8x8;
vp9_subpix_fn_t subpixel_predict_avg16x16;
int allow_high_precision_mv;
int corrupted;
......
......@@ -297,3 +297,49 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
filter_x, x_step_q4, filter_y, y_step_q4,
w, h, 8);
}
void vp9_convolve_copy(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
if (h == 16) {
vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
} else if (h == 8) {
vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
} else if (w == 8) {
vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
} else {
// 4x4
int r;
for (r = 0; r < 4; ++r) {
#if !(CONFIG_FAST_UNALIGNED)
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
dst[3] = src[3];
#else
*(uint32_t *)dst = *(const uint32_t *)src;
#endif
src += src_stride;
dst += dst_stride;
}
}
}
void vp9_convolve_avg(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
dst[x] = (dst[x] + src[x] + 1) >> 1;
}
src += src_stride;
dst += dst_stride;
}
}
This diff is collapsed.
......@@ -21,10 +21,17 @@
#define SUBPEL_SHIFTS 16
extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
// filter kernel as a 2 tap filter.
#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
sizeof(vp9_bilinear_filters[0][0]))
#define BF_OFFSET (BF_LENGTH / 2 - 1)
#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)
#endif // VP9_COMMON_VP9_FILTER_H_
......@@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
uint8_t temp2[2 * 16];
const int16_t *HFilter, *VFilter;
HFilter = vp9_bilinear_filters[xoffset];
VFilter = vp9_bilinear_filters[yoffset];
HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, FData3,
src_pixels_per_line, 1, 3, 16, HFilter);
......@@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
uint8_t temp2[2 * 16];
const int16_t *HFilter, *VFilter;
HFilter = vp9_bilinear_filters[xoffset];
VFilter = vp9_bilinear_filters[yoffset];
HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, FData3,
src_pixels_per_line, 1, 17, 2, HFilter);
......
This diff is collapsed.
......@@ -14,6 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_onyxc_int.h"
struct subpix_fn_table;
extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
uint8_t *dst_y,
int dst_ystride,
......@@ -64,10 +66,10 @@ extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
vp9_subpix_fn_t sppf);
struct subpix_fn_table *sppf);
extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
vp9_subpix_fn_t sppf);
struct subpix_fn_table *sppf);
extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
int pitch);
......
......@@ -23,21 +23,6 @@ EOF
}
forward_decls vp9_common_forward_decls
prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
# compiles warning free but a dissassembly of generated code show bugs. To be
# on the safe side, only enabled when compiled with 'gcc'.
if [ "$CONFIG_GCC" = "yes" ]; then
specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
fi
specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
#
# Dequant
#
......@@ -86,27 +71,17 @@ specialize vp9_dequant_idct_add_uv_block_16x16
#
# RECON
#
prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem16x16 mmx sse2 dspr2
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x8 mmx dspr2
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx
prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_avg_mem16x16
prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_avg_mem8x8
prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx dspr2
vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
specialize vp9_recon_b
......@@ -287,111 +262,6 @@ specialize vp9_convolve8_avg_horiz
prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert
prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict16x16
prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x8
prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg16x16
prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg8x8
prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg4x4
prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x4
prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict4x4
prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict16x16_sharp
prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x8_sharp
prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg16x16_sharp
prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg8x8_sharp
prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg4x4_sharp
prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x4_sharp
prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict4x4_sharp
prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict16x16_smooth
prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x8_smooth
prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg16x16_smooth
prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg8x8_smooth
prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict_avg4x4_smooth
prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict8x4_smooth
prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_eighttap_predict4x4_smooth
prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict16x16
prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict8x8
prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg16x16
prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg8x8
prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict8x4
prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict4x4
prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_sixtap_predict_avg4x4
prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict16x16 sse2
prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict8x8 sse2
prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict_avg16x16
prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict_avg8x8
prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict8x4
prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict4x4
prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
specialize vp9_bilinear_predict_avg4x4
#
# dct
#
......
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_VP9_SUBPIXEL_H_
#define VP9_COMMON_VP9_SUBPIXEL_H_
#define prototype_subpixel_predict(sym) \
void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
uint8_t *dst, int dst_pitch)
typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
#endif // VP9_COMMON_VP9_SUBPIXEL_H_
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
%define BLOCK_HEIGHT_WIDTH 4
%define vp9_filter_weight 128
%define VP9_FILTER_SHIFT 7
;void vp9_filter_block1d_h6_mmx
;(
; unsigned char *src_ptr,
; unsigned short *output_ptr,
; unsigned int src_pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short * vp9_filter
;)
global sym(vp9_filter_block1d_h6_mmx) PRIVATE
sym(vp9_filter_block1d_h6_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 7
GET_GOT rbx
push rsi
push rdi
; end prolog
mov rdx, arg(6) ;vp9_filter
movq mm1, [rdx + 16] ; do both the negative taps first!!!
movq mm2, [rdx + 32] ;
movq mm6, [rdx + 48] ;
movq mm7, [rdx + 64] ;
mov rdi, arg(1) ;output_ptr
mov rsi, arg(0) ;src_ptr
movsxd rcx, dword ptr arg(4) ;output_height
movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
.nextrow:
movq mm3, [rsi-2] ; mm3 = p-2..p5
movq mm4, mm3 ; mm4 = p-2..p5
psrlq mm3, 8 ; mm3 = p-1..p5
punpcklbw mm3, mm0 ; mm3 = p-1..p2
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
movq mm5, mm4 ; mm5 = p-2..p5
punpckhbw mm4, mm0 ; mm5 = p2..p5
pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
paddsw mm3, mm4 ; mm3 += mm5
movq mm4, mm5 ; mm4 = p-2..p5;
psrlq mm5, 16 ; mm5 = p0..p5;
punpcklbw mm5, mm0 ; mm5 = p0..p3
pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
paddsw mm3, mm5 ; mm3 += mm5
movq mm5, mm4 ; mm5 = p-2..p5
psrlq mm4, 24 ; mm4 = p1..p5
punpcklbw mm4, mm0 ; mm4 = p1..p4
pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
paddsw mm3, mm4 ; mm3 += mm5
; do outer positive taps
movd mm4, [rsi+3]
punpcklbw mm4, mm0 ; mm5 = p3..p6
pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
paddsw mm3, mm4 ; mm3 += mm5
punpcklbw mm5, mm0 ; mm5 = p-2..p1
pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
paddsw mm3, mm5 ; mm3 += mm5
paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0 ; pack and unpack to saturate
punpcklbw mm3, mm0 ;
movq [rdi], mm3 ; store the results in the destination
%if ABI_IS_32BIT
add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
add rdi, rax;
%else
movsxd r8, dword ptr arg(2) ;src_pixels_per_line
add rdi, rax;
add rsi, r8 ; next line
%endif
dec rcx ; decrement count
jnz .nextrow ; next row
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1dc_v6_mmx
;(
; short *src_ptr,
; unsigned char *output_ptr,
; int output_pitch,
; unsigned int pixels_per_line,
; unsigned int pixel_step,
; unsigned int output_height,
; unsigned int output_width,
; short * vp9_filter
;)
global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
sym(vp9_filter_block1dc_v6_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 8
GET_GOT rbx
push rsi
push rdi
; end prolog
movq mm5, [GLOBAL(rd)]
push rbx
mov rbx, arg(7) ;vp9_filter
movq mm1, [rbx + 16] ; do both the negative taps first!!!
movq mm2, [rbx + 32] ;
movq mm6, [rbx + 48] ;
movq mm7, [rbx + 64] ;
movsxd rdx, dword ptr arg(3) ;pixels_per_line
mov rdi, arg(1) ;output_ptr
mov rsi, arg(0) ;src_ptr
sub rsi, rdx
sub rsi, rdx
movsxd rcx, DWORD PTR arg(5) ;output_height
movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
pxor mm0, mm0 ; mm0 = 00000000
.nextrow_cv:
movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi] ; mm4 = p0..p3 = row -2
pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
paddsw mm3, mm4 ; mm3 += mm4
paddsw mm3, mm5 ; mm3 += round value
psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
packuswb mm3, mm0 ; pack and saturate
movd [rdi],mm3 ; store the results in the destination
; the subsequent iterations repeat 3 out of 4 of these reads. Since the
; recon block should be in cache this shouldn't cost much. Its obviously
; avoidable!!!.
lea rdi, [rdi+rax] ;
dec rcx ; decrement count
jnz .nextrow_cv ; next row
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
UNSHADOW_ARGS
pop rbp
ret
SECTION_RODATA
align 16
rd:
times 4 dw 0x40
align 16
global HIDDEN_DATA(sym(vp9_six_tap_mmx))
sym(vp9_six_tap_mmx):
times 8 dw 0
times 8 dw 0
times 8 dw 128
times 8 dw 0
times 8 dw 0
times 8 dw 0
times 8 dw 0
times 8 dw -6
times 8 dw 123
times 8 dw 12
times 8 dw -1
times 8 dw 0
times 8 dw 2
times 8 dw -11
times 8 dw 108
times 8 dw 36
times 8 dw -8
times 8 dw 1
times 8 dw 0
times 8 dw -9
times 8 dw 93
times 8 dw 50
times 8 dw -6
times 8 dw 0
times 8 dw 3
times 8 dw -16
times 8 dw 77
times 8 dw 77
times 8 dw -16
times 8 dw 3
times 8 dw 0
times 8 dw -6
times 8 dw 50
times 8 dw 93
times 8 dw -9
times 8 dw 0
times 8 dw 1
times 8 dw -8
times 8 dw 36
times 8 dw 108
times 8 dw -11
times 8 dw 2
times 8 dw 0
times 8 dw -1
times 8 dw 12
times 8 dw 123
times 8 dw -6
times 8 dw 0
This diff is collapsed.
This diff is collapsed.
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
/* Note:
*
* This platform is commonly built for runtime CPU detection. If you modify
* any of the function mappings present in this file, be sure to also update
* them in the function pointer initialization code
*/
#if HAVE_MMX
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_subpix_sixtap16x16
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
#undef vp9_subpix_sixtap8x8
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
#undef vp9_subpix_sixtap8x4
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
#undef vp9_subpix_sixtap4x4
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
#undef vp9_subpix_bilinear16x16
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
#endif
#endif
#if HAVE_SSE2
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_subpix_sixtap16x16
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
#undef vp9_subpix_sixtap8x8
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
#undef vp9_subpix_sixtap8x4
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
#undef vp9_subpix_bilinear16x16
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
#undef vp9_subpix_bilinear8x8
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
#endif
#endif
#if HAVE_SSSE3
extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp9_subpix_sixtap16x16
#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
#undef vp9_subpix_sixtap8x8
#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
#undef vp9_subpix_sixtap8x4
#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
#undef vp9_subpix_sixtap4x4
#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
#undef vp9_subpix_bilinear16x16
#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
#undef vp9_subpix_bilinear8x8
#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3