From 7a07eea13fc94036f54cdb6f1233b9af8b094ced Mon Sep 17 00:00:00 2001 From: John Koleszar <jkoleszar@google.com> Date: Mon, 28 Jan 2013 16:59:03 -0800 Subject: [PATCH] Convert subpixel filters to use convolve framework Update the code to call the new convolution functions to do subpixel prediction rather than the existing functions. Remove the old C and assembly code, since it is unused. This causes a 50% performance reduction on the decoder, but that will be resolved when the asm for the new functions is available. There is no consensus on whether 6-tap or 2-tap predictors will be supported in the final codec, so these filters are implemented in terms of the 8-tap code, so that quality testing of these modes can continue. Implementing the lower complexity algorithms is a simple exercise, should it be necessary. This code produces slightly better results in the EIGHTTAP_SMOOTH case, since the filter is now applied in only one direction when the subpel motion is only in one direction. Like the previous code, the filtering is skipped entirely on full-pel MVs. This combination seems to give the best quality gains, but this may be indicative of a bug in the encoder's filter selection, since the encoder could achieve the result of skipping the filtering on full-pel by selecting one of the other filters. This should be revisited. Quality gains on derf are positive on almost all clips. The only clip that seemed to be hurt at all datarates was football (-0.115% PSNR average, -0.587% min). Overall averages: 0.375% PSNR, 0.347% SSIM. 
Change-Id: I7d469716091b1d89b4b08adde5863999319d69ff --- vp9/common/generic/vp9_systemdependent.c | 2 - vp9/common/ppc/vp9_systemdependent.c | 1 - vp9/common/vp9_blockd.h | 11 +- vp9/common/vp9_convolve.c | 46 + vp9/common/vp9_filter.c | 1119 +--------------- vp9/common/vp9_filter.h | 11 +- vp9/common/vp9_findnearmv.c | 8 +- vp9/common/vp9_reconinter.c | 324 ++--- vp9/common/vp9_reconinter.h | 6 +- vp9/common/vp9_rtcd_defs.sh | 136 +- vp9/common/vp9_subpixel.h | 20 - vp9/common/x86/vp9_asm_stubs.c | 566 -------- vp9/common/x86/vp9_filter_sse2.c | 290 ----- vp9/common/x86/vp9_filter_sse4.c | 362 ------ vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 550 -------- vp9/common/x86/vp9_subpixel_mmx.asm | 268 ---- vp9/common/x86/vp9_subpixel_sse2.asm | 1372 -------------------- vp9/common/x86/vp9_subpixel_ssse3.asm | 1515 ---------------------- vp9/common/x86/vp9_subpixel_x86.h | 109 -- vp9/encoder/vp9_onyx_if.c | 2 + vp9/encoder/vp9_rdopt.c | 4 +- vp9/encoder/vp9_temporal_filter.c | 31 +- vp9/encoder/vp9_variance_c.c | 28 +- vp9/vp9_common.mk | 15 - 24 files changed, 261 insertions(+), 6535 deletions(-) delete mode 100644 vp9/common/vp9_subpixel.h delete mode 100644 vp9/common/x86/vp9_filter_sse2.c delete mode 100644 vp9/common/x86/vp9_filter_sse4.c delete mode 100644 vp9/common/x86/vp9_subpixel_8t_ssse3.asm delete mode 100644 vp9/common/x86/vp9_subpixel_mmx.asm delete mode 100644 vp9/common/x86/vp9_subpixel_sse2.asm delete mode 100644 vp9/common/x86/vp9_subpixel_ssse3.asm delete mode 100644 vp9/common/x86/vp9_subpixel_x86.h diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c index b02f3f0834..79092cd0eb 100644 --- a/vp9/common/generic/vp9_systemdependent.c +++ b/vp9/common/generic/vp9_systemdependent.c @@ -11,8 +11,6 @@ #include "./vpx_config.h" #include "vp9_rtcd.h" -#include "vp9/common/vp9_subpixel.h" -#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" void vp9_machine_specific_config(VP9_COMMON *ctx) 
{ diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c index 106a2b763e..02035191f3 100644 --- a/vp9/common/ppc/vp9_systemdependent.c +++ b/vp9/common/ppc/vp9_systemdependent.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_subpixel.h" #include "vp9/common/vp9_loopfilter.h" #include "recon.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b34f308d3a..241cb8a13f 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -16,9 +16,9 @@ void vpx_log(const char *format, ...); #include "./vpx_config.h" #include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_treecoder.h" -#include "vp9/common/vp9_subpixel.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" @@ -393,15 +393,8 @@ typedef struct macroblockd { void (*inv_walsh4x4_1)(int16_t *in, int16_t *out); void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out); + struct subpix_fn_table subpix; - vp9_subpix_fn_t subpixel_predict4x4; - vp9_subpix_fn_t subpixel_predict8x4; - vp9_subpix_fn_t subpixel_predict8x8; - vp9_subpix_fn_t subpixel_predict16x16; - vp9_subpix_fn_t subpixel_predict_avg4x4; - vp9_subpix_fn_t subpixel_predict_avg8x4; - vp9_subpix_fn_t subpixel_predict_avg8x8; - vp9_subpix_fn_t subpixel_predict_avg16x16; int allow_high_precision_mv; int corrupted; diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index ed188c3f25..f21f1d84e8 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -297,3 +297,49 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } + +void vp9_convolve_copy(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + if (h 
== 16) { + vp9_copy_mem16x16(src, src_stride, dst, dst_stride); + } else if (h == 8) { + vp9_copy_mem8x8(src, src_stride, dst, dst_stride); + } else if (w == 8) { + vp9_copy_mem8x4(src, src_stride, dst, dst_stride); + } else { + // 4x4 + int r; + + for (r = 0; r < 4; ++r) { +#if !(CONFIG_FAST_UNALIGNED) + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; +#else + *(uint32_t *)dst = *(const uint32_t *)src; +#endif + src += src_stride; + dst += dst_stride; + } + } +} + +void vp9_convolve_avg(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + src += src_stride; + dst += dst_stride; + } +} diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 07d8a169f6..5e425895fd 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -15,23 +15,23 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_common.h" -DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = { - { 128, 0 }, - { 120, 8 }, - { 112, 16 }, - { 104, 24 }, - { 96, 32 }, - { 88, 40 }, - { 80, 48 }, - { 72, 56 }, - { 64, 64 }, - { 56, 72 }, - { 48, 80 }, - { 40, 88 }, - { 32, 96 }, - { 24, 104 }, - { 16, 112 }, - { 8, 120 } +DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, + { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, + { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, + { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, + { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, + { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, + { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, + { 0, 0, 0, 8, 120, 0, 0, 0 
} }; #define FILTER_ALPHA 0 @@ -144,1072 +144,21 @@ DECLARE_ALIGNED(16, const int16_t, { 1, -2, -7, 37, 80, 28, -8, -1} }; -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = { - {0, 0, 128, 0, 0, 0}, - {1, -5, 125, 8, -2, 1}, - {1, -8, 122, 17, -5, 1}, - {2, -11, 116, 27, -8, 2}, - {3, -14, 110, 37, -10, 2}, - {3, -15, 103, 47, -12, 2}, - {3, -16, 95, 57, -14, 3}, - {3, -16, 86, 67, -15, 3}, - {3, -16, 77, 77, -16, 3}, - {3, -15, 67, 86, -16, 3}, - {3, -14, 57, 95, -16, 3}, - {2, -12, 47, 103, -15, 3}, - {2, -10, 37, 110, -14, 3}, - {2, -8, 27, 116, -11, 2}, - {1, -5, 17, 122, -8, 1}, - {1, -2, 8, 125, -5, 1} +DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = { + {0, 0, 0, 128, 0, 0, 0, 0}, + {0, 1, -5, 125, 8, -2, 1, 0}, + {0, 1, -8, 122, 17, -5, 1, 0}, + {0, 2, -11, 116, 27, -8, 2, 0}, + {0, 3, -14, 110, 37, -10, 2, 0}, + {0, 3, -15, 103, 47, -12, 2, 0}, + {0, 3, -16, 95, 57, -14, 3, 0}, + {0, 3, -16, 86, 67, -15, 3, 0}, + {0, 3, -16, 77, 77, -16, 3, 0}, + {0, 3, -15, 67, 86, -16, 3, 0}, + {0, 3, -14, 57, 95, -16, 3, 0}, + {0, 2, -12, 47, 103, -15, 3, 0}, + {0, 2, -10, 37, 110, -14, 3, 0}, + {0, 2, -8, 27, 116, -11, 2, 0}, + {0, 1, -5, 17, 122, -8, 1, 0}, + {0, 1, -2, 8, 125, -5, 1, 0} }; - -static void filter_block2d_first_pass_6(uint8_t *src_ptr, - int *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - 
output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -static void filter_block2d_second_pass_6(int *src_ptr, - uint8_t *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -/* - * The only functional difference between filter_block2d_second_pass() - * and this function is that filter_block2d_second_pass() does a sixtap - * filter on the input and stores it in the output. This function - * (filter_block2d_second_pass_avg()) does a sixtap filter on the input, - * and then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. 
- */ -static void filter_block2d_second_pass_avg_6(int *src_ptr, - uint8_t *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) + - output_ptr[j] + 1) >> 1; - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -#define Interp_Extend 3 -static void filter_block2d_6(uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const int16_t *HFilter, - const int16_t *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - - -void vp9_sixtap_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, - VFilter); -} - -/* - * The difference between filter_block2d_6() and filter_block2d_avg_6 is - * that filter_block2d_6() does a 6-tap filter and stores it in the output - * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and - * then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. - */ -static void filter_block2d_avg_6(uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const int16_t *HFilter, - const int16_t *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - -void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter); -} - -void vp9_sixtap_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 8, 8, VFilter); - -} - -void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 8, 8, VFilter); -} - -void vp9_sixtap_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 4, 8, VFilter); -} - -void vp9_sixtap_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter vertically... 
*/ - filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 16, 16, 16, 16, VFilter); -} - -void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 16, 16, 16, 16, VFilter); -} - -typedef enum { - VPX_FILTER_4x4 = 0, - VPX_FILTER_8x8 = 1, - VPX_FILTER_8x4 = 2, - VPX_FILTER_16x16 = 3, -} filter_size_t; - -static const unsigned int filter_size_to_wh[][2] = { - {4, 4}, - {8, 8}, - {8, 4}, - {16,16}, -}; - -static void filter_block2d_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - const filter_size_t filter_size, - uint8_t *dst_ptr, - unsigned int dst_stride) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - // Between passes, we use an intermediate buffer whose height is extended to - // have enough horizontally filtered values as input for the vertical pass. - // This buffer is allocated to be big enough for the largest block type we - // support. 
- const int kInterp_Extend = 4; - const unsigned int intermediate_height = - (kInterp_Extend - 1) + output_height + kInterp_Extend; - - /* Size of intermediate_buffer is max_intermediate_height * filter_max_width, - * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height - * + kInterp_Extend - * = 3 + 16 + 4 - * = 23 - * and filter_max_width = 16 - */ - uint8_t intermediate_buffer[23 * 16]; - const int intermediate_next_stride = 1 - intermediate_height * output_width; - - // Horizontal pass (src -> transposed intermediate). - { - uint8_t *output_ptr = intermediate_buffer; - const int src_next_row_stride = src_stride - output_width; - unsigned int i, j; - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - for (i = 0; i < intermediate_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... - int temp = ((int)src_ptr[0] * HFilter[0]) + - ((int)src_ptr[1] * HFilter[1]) + - ((int)src_ptr[2] * HFilter[2]) + - ((int)src_ptr[3] * HFilter[3]) + - ((int)src_ptr[4] * HFilter[4]) + - ((int)src_ptr[5] * HFilter[5]) + - ((int)src_ptr[6] * HFilter[6]) + - ((int)src_ptr[7] * HFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - output_ptr += intermediate_height; - } - src_ptr += src_next_row_stride; - output_ptr += intermediate_next_stride; - } - } - - // Vertical pass (transposed intermediate -> dst). - { - uint8_t *src_ptr = intermediate_buffer; - const int dst_next_row_stride = dst_stride - output_width; - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... 
- int temp = ((int)src_ptr[0] * VFilter[0]) + - ((int)src_ptr[1] * VFilter[1]) + - ((int)src_ptr[2] * VFilter[2]) + - ((int)src_ptr[3] * VFilter[3]) + - ((int)src_ptr[4] * VFilter[4]) + - ((int)src_ptr[5] * VFilter[5]) + - ((int)src_ptr[6] * VFilter[6]) + - ((int)src_ptr[7] * VFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr += intermediate_height; - } - src_ptr += intermediate_next_stride; - dst_ptr += dst_next_row_stride; - } - } -} - -void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_4x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x8, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_16x16, dst_ptr, dst_stride); -} - -static void block2d_average_c(uint8_t *src, - unsigned int src_stride, - uint8_t *output_ptr, - unsigned int 
output_stride, - const filter_size_t filter_size) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; - } - output_ptr += output_stride; - } -} - -#define block2d_average block2d_average_c - -void vp9_eighttap_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8[xoffset]; - VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8s[xoffset]; - VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8lp[xoffset]; - VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - 
vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - - -void vp9_eighttap_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t 
*VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t 
*dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t 
*HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_stride : Stride of source block. - * uint32_t height : Block height. - * uint32_t width : Block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block - * in the horizontal direction to produce the filtered output - * block. Used to implement first-pass of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. 
- * Two filter taps should sum to VP9_FILTER_WEIGHT. - * - ****************************************************************************/ -static void filter_block2d_bil_first_pass(uint8_t *src_ptr, - uint16_t *dst_ptr, - unsigned int src_stride, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply bilinear filter */ - dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[1] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - src_ptr++; - } - - /* Next row... */ - src_ptr += src_stride - width; - dst_ptr += width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t dst_pitch : Destination block pitch. - * uint32_t height : Block height. - * uint32_t width : Block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block - * in the vertical direction to produce the filtered output - * block. Used to implement second-pass of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. 
- * - ****************************************************************************/ -static void filter_block2d_bil_second_pass(uint16_t *src_ptr, - uint8_t *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[width] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2); - dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... */ - dst_ptr += dst_pitch; - } -} - -/* - * As before for filter_block2d_second_pass_avg(), the functional difference - * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg() - * is that filter_block2d_bil_second_pass() does a bilinear filter on input - * and stores the result in output; filter_block2d_bil_second_pass_avg(), - * instead, does a bilinear filter on input, averages the resulting value - * with the values already present in the output and stores the result of - * that back into the output ((filter_result + dest + 1) >> 1). - */ -static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr, - uint8_t *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply filter */ - temp = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[width] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1); - src_ptr++; - } - - /* Next row... */ - dst_ptr += dst_pitch; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pitch : Stride of source block. 
- * uint32_t dst_pitch : Stride of destination block. - * int32_t *HFilter : Array of 2 horizontal filter taps. - * int32_t *VFilter : Array of 2 vertical filter taps. - * int32_t Width : Block width - * int32_t Height : Block height - * - * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : 2-D filters an input block by applying a 2-tap - * bi-linear filter horizontally followed by a 2-tap - * bi-linear filter vertically on the result. - * - * SPECIAL NOTES : The largest block size can be handled here is 16x16 - * - ****************************************************************************/ -static void filter_block2d_bil(uint8_t *src_ptr, - uint8_t *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const int16_t *HFilter, - const int16_t *VFilter, - int Width, - int Height) { - - uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... */ - filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - -static void filter_block2d_bil_avg(uint8_t *src_ptr, - uint8_t *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const int16_t *HFilter, - const int16_t *VFilter, - int Width, - int Height) { - uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... 
*/ - filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - -void vp9_bilinear_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); - -} - -void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 8, 8); -} - -void vp9_bilinear_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, 
dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); - -} - -void vp9_bilinear_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); -} - -void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 16, 16); -} diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index cd666578d3..1ccfdaac25 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -21,10 +21,17 @@ #define SUBPEL_SHIFTS 16 -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]; +extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; +// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear +// filter kernel as a 2 tap filter. 
+#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \ + sizeof(vp9_bilinear_filters[0][0])) +#define BF_OFFSET (BF_LENGTH / 2 - 1) +#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET) + #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 88f2ea9c18..f2c8891081 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr, uint8_t temp2[2 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 3, 16, HFilter); @@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr, uint8_t temp2[2 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 2, HFilter); diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 20de7b7f1d..d4435d872d 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -8,66 +8,58 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERPOLATIONFILTERTYPE mcomp_filter_type, VP9_COMMON *cm) { + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. 
Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + xd->subpix.predict[0][0][0] = vp9_convolve_copy; + xd->subpix.predict[0][0][1] = vp9_convolve_avg; + xd->subpix.predict[0][1][0] = vp9_convolve8_vert; + xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert; + xd->subpix.predict[1][0][0] = vp9_convolve8_horiz; + xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz; + xd->subpix.predict[1][1][0] = vp9_convolve8; + xd->subpix.predict[1][1][1] = vp9_convolve8_avg; + + xd->subpix.x_step_q4 = 16; + xd->subpix.y_step_q4 = 16; + switch (mcomp_filter_type) { + case EIGHTTAP: + case SWITCHABLE: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8; + break; + case EIGHTTAP_SMOOTH: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp; + break; + case EIGHTTAP_SHARP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s; + break; + case BILINEAR: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters; + break; #if CONFIG_ENABLE_6TAP - if (mcomp_filter_type == SIXTAP) { - xd->subpixel_predict4x4 = vp9_sixtap_predict4x4; - xd->subpixel_predict8x4 = vp9_sixtap_predict8x4; - xd->subpixel_predict8x8 = vp9_sixtap_predict8x8; - xd->subpixel_predict16x16 = vp9_sixtap_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16; - } else { + case SIXTAP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6; + break; #endif - if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4; - 
xd->subpixel_predict8x8 = vp9_eighttap_predict8x8; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16; - } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_smooth; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_smooth; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_smooth; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_smooth; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth; - } else if (mcomp_filter_type == EIGHTTAP_SHARP) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_sharp; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_sharp; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_sharp; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_sharp; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c; - } else { - xd->subpixel_predict4x4 = vp9_bilinear_predict4x4; - xd->subpixel_predict8x4 = vp9_bilinear_predict8x4; - xd->subpixel_predict8x8 = vp9_bilinear_predict8x8; - xd->subpixel_predict16x16 = vp9_bilinear_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16; - } -#if CONFIG_ENABLE_6TAP } -#endif } -void vp9_copy_mem16x16_c(uint8_t *src, +void vp9_copy_mem16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -93,10 +85,10 @@ void vp9_copy_mem16x16_c(uint8_t *src, dst[15] = src[15]; #else - ((uint32_t *)dst)[0] = 
((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((uint32_t *)src)[3]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; + ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; + ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; #endif src += src_stride; @@ -104,25 +96,7 @@ void vp9_copy_mem16x16_c(uint8_t *src, } } -void vp9_avg_mem16x16_c(uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { - int n; - - for (n = 0; n < 16; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(uint8_t *src, +void vp9_copy_mem8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -139,33 +113,15 @@ void vp9_copy_mem8x8_c(uint8_t *src, dst[6] = src[6]; dst[7] = src[7]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; #endif src += src_stride; dst += dst_stride; } } -void vp9_avg_mem8x8_c(uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { - int n; - - for (n = 0; n < 8; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(uint8_t *src, +void vp9_copy_mem8x4_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -182,16 +138,16 @@ void vp9_copy_mem8x4_c(uint8_t *src, dst[6] = src[6]; dst[7] = src[7]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; #endif src += src_stride; dst += dst_stride; } } -void vp9_build_inter_predictors_b(BLOCKD *d, 
int pitch, vp9_subpix_fn_t sppf) { - int r; +void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, + struct subpix_fn_table *subpix) { uint8_t *ptr_base; uint8_t *ptr; uint8_t *pred_ptr = d->predictor; @@ -199,30 +155,14 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { ptr_base = *(d->base_pre); mv.as_int = d->bmi.as_mv.first.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - pred_ptr[0] = ptr[0]; - pred_ptr[1] = ptr[1]; - pred_ptr[2] = ptr[2]; - pred_ptr[3] = ptr[3]; -#else - *(uint32_t *)pred_ptr = *(uint32_t *)ptr; -#endif - pred_ptr += pitch; - ptr += d->pre_stride; - } - } + subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0]( + ptr, d->pre_stride, pred_ptr, pitch, + subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4, + subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4, + 4, 4); } /* @@ -232,8 +172,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { * predictor of the second reference frame / motion vector. 
*/ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf) { - int r; + struct subpix_fn_table *subpix) { uint8_t *ptr_base; uint8_t *ptr; uint8_t *pred_ptr = d->predictor; @@ -241,26 +180,14 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, ptr_base = *(d->base_second_pre); mv.as_int = d->bmi.as_mv.second.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { - pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1; - pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1; - pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1; - pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1; - pred_ptr += pitch; - ptr += d->pre_stride; - } - } + subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1]( + ptr, d->pre_stride, pred_ptr, pitch, + subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4, + subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4, + 4, 4); } void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { @@ -274,12 +201,11 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } + xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0]( + ptr, d->pre_stride, pred_ptr, pitch, + xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + 
xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 8, 8); } /* @@ -300,12 +226,11 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } + xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1]( + ptr, d->pre_stride, pred_ptr, pitch, + xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 8, 8); } static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { @@ -319,12 +244,11 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch); - } + xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0]( + ptr, d->pre_stride, pred_ptr, pitch, + xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 8, 4); } /*encoder only*/ @@ -411,13 +335,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) build_inter_predictors2b(xd, d0, 8); else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_inter_predictors_b(d1, 8, &xd->subpix); } if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - 
vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix); } } } @@ -475,14 +399,11 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3); - if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { - xd->subpixel_predict16x16(ptr, pre_stride, - (ymv.as_mv.col & 7) << 1, - (ymv.as_mv.row & 7) << 1, - dst_y, dst_ystride); - } else { - vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } + xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0]( + ptr, pre_stride, dst_y, dst_ystride, + xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 16, 16); } void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, @@ -523,15 +444,19 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, uptr = xd->pre.u_buffer + offset; vptr = xd->pre.v_buffer + offset; - if (_o16x16mv.as_int & 0x000f000f) { - xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride); - xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride); - } else { - vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); - } + xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)] + [!!(_o16x16mv.as_mv.row & 15)][0]( + uptr, pre_stride, dst_u, dst_uvstride, + xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4, + 8, 8); + + xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)] + [!!(_o16x16mv.as_mv.row & 15)][0]( + vptr, pre_stride, dst_v, dst_uvstride, + 
xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4, + 8, 8); } @@ -714,12 +639,11 @@ void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1, - (mv_row & 7) << 1, dst_y, dst_ystride); - } else { - vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } + xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1]( + ptr, pre_stride, dst_y, dst_ystride, + xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4, + 16, 16); } void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, @@ -758,15 +682,17 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, uptr = xd->second_pre.u_buffer + offset; vptr = xd->second_pre.v_buffer + offset; - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15, - omv_row & 15, dst_u, dst_uvstride); - xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15, - omv_row & 15, dst_v, dst_uvstride); - } else { - vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); - } + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1]( + uptr, pre_stride, dst_u, dst_uvstride, + xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4, + 8, 8); + + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1]( + vptr, pre_stride, dst_v, dst_uvstride, + xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4, + 8, 8); } void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, @@ -835,13 +761,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { if (d0->bmi.as_mv.first.as_int == 
d1->bmi.as_mv.first.as_int) build_inter_predictors2b(xd, d0, 16); else { - vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(d0, 16, &xd->subpix); + vp9_build_inter_predictors_b(d1, 16, &xd->subpix); } if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix); + vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix); } } } @@ -853,13 +779,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) build_inter_predictors2b(xd, d0, 8); else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_inter_predictors_b(d1, 8, &xd->subpix); } if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix); } } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 89868b95ef..903bd2e86d 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -14,6 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_onyxc_int.h" +struct subpix_fn_table; + extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, uint8_t *dst_y, int dst_ystride, @@ -64,10 +66,10 @@ extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd); extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); + struct subpix_fn_table *sppf); extern void 
vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); + struct subpix_fn_table *sppf); extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 762dd75c0a..9698172b2a 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -23,21 +23,6 @@ EOF } forward_decls vp9_common_forward_decls -prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" -prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" -prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" -prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" - -# At the very least, MSVC 2008 has compiler bug exhibited by this code; code -# compiles warning free but a dissassembly of generated code show bugs. To be -# on the safe side, only enabled when compiled with 'gcc'. 
-if [ "$CONFIG_GCC" = "yes" ]; then - specialize vp9_filter_block2d_4x4_8 sse4_1 sse2 -fi - specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2 - specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2 - specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2 - # # Dequant # @@ -86,27 +71,17 @@ specialize vp9_dequant_idct_add_uv_block_16x16 # # RECON # -prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem16x16 mmx sse2 dspr2 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 -prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem8x8 mmx dspr2 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 -prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem8x4 mmx -prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_avg_mem16x16 - -prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_avg_mem8x8 - -prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx dspr2 -vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2 - prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride" specialize vp9_recon_b @@ -287,111 +262,6 @@ specialize vp9_convolve8_avg_horiz prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg_vert -prototype void vp9_eighttap_predict16x16 
"uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16 - -prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8 - -prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16 - -prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8 - -prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4 - -prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4 - -prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4 - -prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16_sharp - -prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8_sharp - -prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16_sharp - -prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int 
dst_pitch" -specialize vp9_eighttap_predict_avg8x8_sharp - -prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4_sharp - -prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4_sharp - -prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4_sharp - -prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16_smooth - -prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8_smooth - -prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16_smooth - -prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8_smooth - -prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4_smooth - -prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4_smooth - -prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" 
-specialize vp9_eighttap_predict4x4_smooth - -prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict16x16 - -prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict8x8 - -prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg16x16 - -prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg8x8 - -prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict8x4 - -prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict4x4 - -prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg4x4 - -prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict16x16 sse2 - -prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict8x8 sse2 - -prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg16x16 - -prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, 
uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg8x8 - -prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict8x4 - -prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict4x4 - -prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg4x4 - # # dct # diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h deleted file mode 100644 index dc4eadfb19..0000000000 --- a/vp9/common/vp9_subpixel.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP9_COMMON_VP9_SUBPIXEL_H_ -#define VP9_COMMON_VP9_SUBPIXEL_H_ - -#define prototype_subpixel_predict(sym) \ - void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \ - uint8_t *dst, int dst_pitch) - -typedef prototype_subpixel_predict((*vp9_subpix_fn_t)); - -#endif // VP9_COMMON_VP9_SUBPIXEL_H_ diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f09e2d78be..d233247b2b 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -11,88 +11,6 @@ #include "./vpx_config.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_subpixel.h" - -extern const short vp9_six_tap_mmx[8][6 * 8]; - -extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr, - 
unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width); - -extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_lin, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - /////////////////////////////////////////////////////////////////////////// // the mmx function that does the bilinear filtering and var calculation // // int one pass // @@ -115,487 +33,3 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { { 16, 16, 16, 16, 112, 112, 112, 112 }, { 8, 8, 8, 8, 120, 120, 120, 120 } }; - -#if HAVE_MMX -void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16); - const short *hfilter, *vfilter; - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 8, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, 
dst_pitch, - 8, 4, 4, 4, vfilter); -} - -void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, - fdata2 + 8, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - fdata2 + 12, src_pixels_per_line, 1, 21, 32, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch, - 32, 16, 16, 16, vfilter); -} - -void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 13, 16, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, 
src_pixels_per_line, 1, 13, 16, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 8, 8, vfilter); -} - -void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 9, 16, hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 4, 8, vfilter); -} -#endif - -#if HAVE_SSE2 -void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 21, 32, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_only_sse2(src_ptr, 
src_pixels_per_line, - dst_ptr, dst_pitch, 16, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 21, 32); - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } -} - -void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 13, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, vfilter); - } -} - -void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 16, 
hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, vfilter); - } -} -#endif - -#if HAVE_SSSE3 -extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - 
unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - fdata2, 16, 21, xoffset); - vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, - 16, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } -} - -void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 13, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); - } else { - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } -} - -void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 9, xoffset); - 
vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 4, 9, xoffset); - vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); - } else { - vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), 
src_stride, - fdata2, 16, 23, hfilter_aligned16); - vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, - 16, hfilter_aligned16); - } else { - vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 16, vfilter_aligned16); - } - } -} - -void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 15, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 8, vfilter_aligned16); - } - } -} - -void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) { - 
DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 11, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 4, vfilter_aligned16); - } - } -} -#endif diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c deleted file mode 100644 index 8e02ac1975..0000000000 --- a/vp9/common/x86/vp9_filter_sse2.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <emmintrin.h> // SSE2 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/emmintrin_compat.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. 
- -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, src_ptr, offset) \ - { \ - /* Do shifted load to achieve require shuffles through unpacking */ \ - const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \ - const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \ - const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \ - const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \ - const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \ - const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \ - const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \ - const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \ - /* Shit by 4 bytes through suffle to get additional shifted loads */ \ - const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \ - const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \ - const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \ - const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \ - /* multiply accumulate them */ \ - const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all 
= _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 
3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); - const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); - const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); - const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); - // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 - // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx - // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); - const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); - // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 - // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 - // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx - // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx - const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_2 = 
_mm_unpacklo_epi8(transpose1_2, transpose1_3); - const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx - // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(3, 2, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(3, 2, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). 
- { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - DECLARE_ALIGNED(16, unsigned char, temp[32]); - { - _mm_store_si128((__m128i *)temp, transpose3_0); - DO_FOUR_PIXELS(col0, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_1); - DO_FOUR_PIXELS(col1, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_2); - DO_FOUR_PIXELS(col2, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_3); - DO_FOUR_PIXELS(col3, temp, 0); - } - // transpose - { - __m128i T0 = _mm_unpacklo_epi32(col0, col1); - __m128i T1 = _mm_unpacklo_epi32(col2, col3); - __m128i T2 = _mm_unpackhi_epi32(col0, col1); - __m128i T3 = _mm_unpackhi_epi32(col2, col3); - col0 = _mm_unpacklo_epi64(T0, T1); - col1 = _mm_unpackhi_epi64(T0, T1); - col2 = _mm_unpacklo_epi64(T2, T3); - col3 = _mm_unpackhi_epi64(T2, T3); - } - // saturate to 8 bit - { - col0 = _mm_packs_epi32(col0, col0); - col0 = _mm_packus_epi16(col0, col0); - col1 = _mm_packs_epi32(col1, col1); - col1 = _mm_packus_epi16(col1, col1); - col2 = _mm_packs_epi32 (col2, col2); - col2 = _mm_packus_epi16(col2, col2); - col3 = _mm_packs_epi32 (col3, col3); - col3 = _mm_packus_epi16(col3, col3); - } - // store - { - *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short 
*HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c deleted file mode 100644 index 52c35b2968..0000000000 --- a/vp9/common/x86/vp9_filter_sse4.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> // for alignment checks -#include <smmintrin.h> // SSE4.1 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Reduce source size by using macros instead of current code -// duplication. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { - 0x00, 0x01, - 0x01, 0x02, - 0x02, 0x03, - 0x03, 0x04, - 0x02, 0x03, - 0x03, 0x04, - 0x04, 0x05, - 0x05, 0x06, -}; -DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { - 0x04, 0x05, - 0x05, 0x06, - 0x06, 0x07, - 0x07, 0x08, - 0x06, 0x07, - 0x07, 0x08, - 0x08, 0x09, - 0x09, 0x0A, -}; -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; -DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, offset) \ - { \ - /*load pixels*/ \ - __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ - /* extract the ones used for first column */ \ - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ - __m128i src45_16 = 
_mm_unpacklo_epi8(src4567, zero); \ - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ - /* multiply accumulate them */ \ - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); - const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). 
- { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); - const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); - const __m128i transpose1_2 = 
_mm_shuffle_epi8(intermediateC, transpose); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx - const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(0, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(1, 1, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(2, 2, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(3, 3, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). 
- { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - { - //load pixels - __m128i src = transpose3_0; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col0 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_1; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, 
fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col1 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_2; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col2 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_3; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = 
_mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col3 = _mm_packus_epi16(mad_all, mad_all); - } - { - __m128i col01 = _mm_unpacklo_epi8(col0, col1); - __m128i col23 = _mm_unpacklo_epi8(col2, col3); - __m128i col0123 = _mm_unpacklo_epi16(col01, col23); - //TODO(cd): look into Ronald's comment: - // Future suggestion: I believe here, too, you can merge the - // packs_epi32() and pacus_epi16() for the 4 cols above, so that - // you get the data in a single register, and then use pshufb - // (shuffle_epi8()) instead of the unpacks here. Should be - // 2+3+2 instructions faster. - *((unsigned int *)&dst_ptr[dst_stride * 0]) = - _mm_extract_epi32(col0123, 0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = - _mm_extract_epi32(col0123, 1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = - _mm_extract_epi32(col0123, 2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = - _mm_extract_epi32(col0123, 3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - 
vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm deleted file mode 100644 index c6d65e9043..0000000000 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,550 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
-; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ - -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.vp9_filter_block1d8_v8_ssse3_loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 
;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm0, krd - paddsw xmm4, xmm6 - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx - - movq [rdi], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.vp9_filter_block1d16_v8_ssse3_loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C 
- movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm0, krd - paddsw xmm4, xmm6 - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movq [rdi], xmm0 - - movq xmm0, [rsi + 8] ;A - movq xmm1, [rsi + rdx + 8] ;B - movq xmm2, [rsi + rdx * 2 + 8] ;C - movq xmm3, [rax + rdx * 2 + 8] ;D - movq xmm4, [rsi + rdx * 4 + 8] ;E - movq xmm5, [rax + rdx * 4 + 8] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - - movq xmm6, [rsi + rbx + 8] ;G - movq xmm7, [rax + rbx + 8] ;H - punpcklbw xmm6, xmm7 ;G H - - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm4, xmm6 - paddsw xmm0, krd - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx - - movq [rdi+8], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE -sym(vp9_filter_block1d8_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - 
%define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 -; movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - -.filter_block1d8_h8_rowloop_ssse3: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 - punpcklqdq xmm0, xmm3 - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm4 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - movq [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d8_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE 
-sym(vp9_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - -.filter_block1d16_h8_rowloop_ssse3: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 - punpcklqdq xmm0, xmm3 - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 - - - movq xmm3, [rsi + 5] -; movq xmm7, [rsi + 12] - movq xmm7, [rsi + 13] -;note: same as above -; punpcklbw xmm3, xmm7 - punpcklqdq xmm3, xmm7 - - movdqa xmm1, xmm3 - pshufb xmm3, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm3, k0k1 - - 
movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm3, xmm1 - paddsw xmm3, xmm2 - paddsw xmm3, krd - paddsw xmm3, xmm4 - psraw xmm3, 7 - packuswb xmm3, xmm3 - punpcklqdq xmm0, xmm3 - - lea rsi, [rsi + rax] - movdqa [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -shuf_t0t1: - db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -align 16 -shuf_t2t3: - db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -align 16 -shuf_t4t5: - db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -align 16 -shuf_t6t7: - db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm deleted file mode 100644 index dee29b8fbb..0000000000 --- a/vp9/common/x86/vp9_subpixel_mmx.asm +++ /dev/null @@ -1,268 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - - -%define BLOCK_HEIGHT_WIDTH 4 -%define vp9_filter_weight 128 -%define VP9_FILTER_SHIFT 7 - - -;void vp9_filter_block1d_h6_mmx -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1d_h6_mmx) PRIVATE -sym(vp9_filter_block1d_h6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - - movq mm1, [rdx + 16] ; do both the negative taps first!!! - movq mm2, [rdx + 32] ; - movq mm6, [rdx + 48] ; - movq mm7, [rdx + 64] ; - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - movq mm3, [rsi-2] ; mm3 = p-2..p5 - movq mm4, mm3 ; mm4 = p-2..p5 - psrlq mm3, 8 ; mm3 = p-1..p5 - punpcklbw mm3, mm0 ; mm3 = p-1..p2 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. 
- - movq mm5, mm4 ; mm5 = p-2..p5 - punpckhbw mm4, mm0 ; mm5 = p2..p5 - pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - movq mm4, mm5 ; mm4 = p-2..p5; - psrlq mm5, 16 ; mm5 = p0..p5; - punpcklbw mm5, mm0 ; mm5 = p0..p3 - pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - movq mm5, mm4 ; mm5 = p-2..p5 - psrlq mm4, 24 ; mm4 = p1..p5 - punpcklbw mm4, mm0 ; mm4 = p1..p4 - pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - ; do outer positive taps - movd mm4, [rsi+3] - punpcklbw mm4, mm0 ; mm5 = p3..p6 - pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - paddsw mm3, [GLOBAL(rd)] ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and unpack to saturate - punpcklbw mm3, mm0 ; - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line - add rdi, rax; -%else - movsxd r8, dword ptr arg(2) ;src_pixels_per_line - add rdi, rax; - - add rsi, r8 ; next line -%endif - - dec rcx ; decrement count - jnz .nextrow ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1dc_v6_mmx -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int output_pitch, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1dc_v6_mmx) PRIVATE -sym(vp9_filter_block1dc_v6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movq mm5, [GLOBAL(rd)] - push rbx - mov rbx, arg(7) ;vp9_filter - movq mm1, [rbx + 16] ; do both the negative taps first!!! 
- movq mm2, [rbx + 32] ; - movq mm6, [rbx + 48] ; - movq mm7, [rbx + 64] ; - - movsxd rdx, dword ptr arg(3) ;pixels_per_line - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - sub rsi, rdx - sub rsi, rdx - movsxd rcx, DWORD PTR arg(5) ;output_height - movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - - -.nextrow_cv: - movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 - pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 - pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi] ; mm4 = p0..p3 = row -2 - pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - add rsi, rdx ; move source forward 1 line to avoid 3 * pitch - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 - pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 - pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - paddsw mm3, mm5 ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and saturate - - movd [rdi],mm3 ; store the results in the destination - ; the subsequent iterations repeat 3 out of 4 of these reads. Since the - ; recon block should be in cache this shouldn't cost much. Its obviously - ; avoidable!!!. 
- lea rdi, [rdi+rax] ; - dec rcx ; decrement count - jnz .nextrow_cv ; next row - - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -rd: - times 4 dw 0x40 - -align 16 -global HIDDEN_DATA(sym(vp9_six_tap_mmx)) -sym(vp9_six_tap_mmx): - times 8 dw 0 - times 8 dw 0 - times 8 dw 128 - times 8 dw 0 - times 8 dw 0 - times 8 dw 0 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw 0 - - times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - - times 8 dw 0 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw 0 - - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw 0 - - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - - times 8 dw 0 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - times 8 dw 0 - diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm deleted file mode 100644 index b0c4f12825..0000000000 --- a/vp9/common/x86/vp9_subpixel_sse2.asm +++ /dev/null @@ -1,1372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -global sym(vp9_filter_block1d8_h6_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 
00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
-;*************************************************************************************/ -global sym(vp9_filter_block1d16_h6_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - 
paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi+16], xmm4 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_sse2 -;( -; short *src_ptr, -; unsigned char *output_ptr, 
-; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d8_v6_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_sse2_loop: - movdqa xmm1, XMMWORD PTR [rsi] - pmullw xmm1, [rax] - - movdqa xmm2, XMMWORD PTR [rsi + rdx] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] - pmullw xmm3, [rax + 32] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] - pmullw xmm5, [rax + 64] - - add rsi, rdx - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] - - pmullw xmm4, [rax + 48] - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] - - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - 
UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_v6_sse2 -;( -; unsigned short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; const short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d16_v6_sse2) PRIVATE -sym(vp9_filter_block1d16_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d16_v6_sse2_loop: -; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. 
- movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 - movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] - pmullw xmm1, [rax + 16] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm3, [rax + 64] - pmullw xmm4, [rax + 64] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm5, [rax + 32] - pmullw xmm6, [rax + 32] - - movdqa xmm7, XMMWORD PTR [rsi] ; line 1 - movdqa xmm0, XMMWORD PTR [rsi + 16] - pmullw xmm7, [rax] - pmullw xmm0, [rax] - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - paddsw xmm1, xmm7 - paddsw xmm2, xmm0 - - add rsi, rdx - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm3, [rax + 48] - pmullw xmm4, [rax + 48] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm5, [rax + 80] - pmullw xmm6, [rax + 80] - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] - pxor xmm0, xmm0 ; clear xmm0 - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - - paddsw xmm1, xmm7 - paddsw xmm2, xmm7 - - psraw xmm1, 7 - psraw xmm2, 7 - - packuswb xmm1, xmm2 ; pack and saturate - movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d16_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_only_sse2): - 
push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_only_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, 
[rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 
xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - 
psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp9_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq 
QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE -sym(vp9_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict16x16_sse2) PRIVATE -sym(vp9_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - 
SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - 
movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add 
rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict8x8_sse2) PRIVATE -sym(vp9_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. 
- movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm deleted file mode 100644 index b260480e03..0000000000 --- a/vp9/common/x86/vp9_subpixel_ssse3.asm +++ /dev/null @@ -1,1515 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. 
-; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE -sym(vp9_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp9_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp9_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - 
movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE -sym(vp9_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 
5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE -sym(vp9_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr 
arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] - pmaddubsw xmm0, xmm4 - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE -sym(vp9_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push 
rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp9_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - 
RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp9_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE -sym(vp9_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - 
- movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add 
rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE -sym(vp9_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) 
;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE -sym(vp9_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd 
rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done 
- -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 8] ; load row 0 - - lea rsi, [rsi + rax] ; next line -.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP9_FILTER_SHIFT - psraw xmm2, VP9_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP9_FILTER_SHIFT - psraw xmm5, VP9_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP9_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw 
xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP9_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP9_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE -sym(vp9_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. 
- movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw 
xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP9_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - psraw xmm6, VP9_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP9_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP9_FILTER_SHIFT - packuswb xmm7, xmm7 - 
- packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP9_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - 
times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 - diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h deleted file mode 100644 index 25bc26d9bd..0000000000 --- a/vp9/common/x86/vp9_subpixel_x86.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ -#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ - -/* Note: - * - * This platform is commonly built for runtime CPU detection. 
If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2); - - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2 - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2 - -#endif -#endif - -#if HAVE_SSSE3 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3); -extern 
prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3 - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3 - - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3 - -#endif -#endif - - - -#endif diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 4a0794a9b2..1bbf95468f 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_quantize.h" @@ -3953,6 +3954,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cm->fb_idx_ref_cnt[cm->new_fb_idx]--; cm->new_fb_idx = get_free_fb(cm); + vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); if (cpi->pass == 1) { Pass1Encode(cpi, size, dest, frame_flags); } else if (cpi->pass == 2) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2e9bbcfc1e..200a6a9473 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2222,9 +2222,9 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x, BLOCK *be = &x->block[i]; int thisdistortion; - vp9_build_inter_predictors_b(bd, 
16, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(bd, 16, &xd->subpix); if (xd->mode_info_context->mbmi.second_ref_frame > 0) - vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix); vp9_subtract_b(be, bd, 16); x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4(be, bd); diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8bbe534860..7bca01e051 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -50,12 +50,11 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, // Y yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3); - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict16x16(yptr, stride, - (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16); - } else { - vp9_copy_mem16x16(yptr, stride, &pred[0], 16); - } + xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][0]( + yptr, stride, &pred[0], 16, + xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4, + 16, 16); // U & V omv_row = mv_row; @@ -67,15 +66,17 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uptr = u_mb_ptr + offset; vptr = v_mb_ptr + offset; - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict8x8(uptr, stride, - (omv_col & 15), (omv_row & 15), &pred[256], 8); - xd->subpixel_predict8x8(vptr, stride, - (omv_col & 15), (omv_row & 15), &pred[320], 8); - } else { - vp9_copy_mem8x8(uptr, stride, &pred[256], 8); - vp9_copy_mem8x8(vptr, stride, &pred[320], 8); - } + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0]( + uptr, stride, &pred[256], 8, + xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4, + xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4, + 8, 8); + + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0]( + vptr, stride, &pred[320], 8, + xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4, + 
xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4, + 8, 8); } void vp9_temporal_filter_apply_c(uint8_t *frame1, diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index d03e285c63..d07a65b455 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, const int16_t *HFilter, *VFilter; uint16_t FData3[5 * 4]; // Temp data bufffer used in filtering - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter); @@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); @@ -186,8 +186,8 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); @@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, uint8_t temp2[68 * 64]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; 
+ HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 65, 64, HFilter); @@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, uint8_t temp2[36 * 32]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter); @@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); @@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index d1805be62b..c7e8acb454 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -56,7 +56,6 @@ VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h -VP9_COMMON_SRCS-yes += common/vp9_subpixel.h 
VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h VP9_COMMON_SRCS-yes += common/vp9_textblit.h @@ -81,7 +80,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c @@ -90,7 +88,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm @@ -98,10 +95,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm @@ -113,19 +107,10 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c 
VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm endif -VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c -ifeq ($(HAVE_SSE4_1),yes) -vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4 -vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4 -endif - -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c ifeq ($(HAVE_SSE2),yes) -vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2 -vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2 endif -- GitLab