diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c index b02f3f083430cb17f27f8139ee0294f00cc16e32..79092cd0eb6020010328ae39c02c45ce4dbfcc78 100644 --- a/vp9/common/generic/vp9_systemdependent.c +++ b/vp9/common/generic/vp9_systemdependent.c @@ -11,8 +11,6 @@ #include "./vpx_config.h" #include "vp9_rtcd.h" -#include "vp9/common/vp9_subpixel.h" -#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_onyxc_int.h" void vp9_machine_specific_config(VP9_COMMON *ctx) { diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c index 106a2b763e31a35b46f588cd1bdd54f03b6d0518..02035191f37fbac5d415b9a0392c1fc4478b2ffe 100644 --- a/vp9/common/ppc/vp9_systemdependent.c +++ b/vp9/common/ppc/vp9_systemdependent.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_subpixel.h" #include "vp9/common/vp9_loopfilter.h" #include "recon.h" #include "vp9/common/vp9_onyxc_int.h" diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index b34f308d3af58b04ce396edafe2ba96cae03c83a..241cb8a13f7a38b55a0354c4991f489e1bb64674 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -16,9 +16,9 @@ void vpx_log(const char *format, ...); #include "./vpx_config.h" #include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_convolve.h" #include "vp9/common/vp9_mv.h" #include "vp9/common/vp9_treecoder.h" -#include "vp9/common/vp9_subpixel.h" #include "vpx_ports/mem.h" #include "vp9/common/vp9_common.h" @@ -393,15 +393,8 @@ typedef struct macroblockd { void (*inv_walsh4x4_1)(int16_t *in, int16_t *out); void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out); + struct subpix_fn_table subpix; - vp9_subpix_fn_t subpixel_predict4x4; - vp9_subpix_fn_t subpixel_predict8x4; - vp9_subpix_fn_t subpixel_predict8x8; - vp9_subpix_fn_t subpixel_predict16x16; - vp9_subpix_fn_t subpixel_predict_avg4x4; - vp9_subpix_fn_t subpixel_predict_avg8x4; - vp9_subpix_fn_t subpixel_predict_avg8x8; - vp9_subpix_fn_t subpixel_predict_avg16x16; int allow_high_precision_mv; int corrupted; diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c index ed188c3f25364e8ea906b69990735afa575557d9..f21f1d84e886ea997239978e2f42138df3dd2bed 100644 --- a/vp9/common/vp9_convolve.c +++ b/vp9/common/vp9_convolve.c @@ -297,3 +297,49 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8); } + +void vp9_convolve_copy(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + if (h == 16) { + vp9_copy_mem16x16(src, src_stride, dst, dst_stride); + } else if (h == 8) { + vp9_copy_mem8x8(src, src_stride, dst, dst_stride); + } else if (w == 8) { + vp9_copy_mem8x4(src, src_stride, dst, dst_stride); + } else { + // 4x4 + int r; + + for (r = 0; r < 4; ++r) { +#if !(CONFIG_FAST_UNALIGNED) + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; +#else + *(uint32_t *)dst = *(const uint32_t *)src; +#endif + src += src_stride; + dst += dst_stride; + } + } +} + +void vp9_convolve_avg(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + src += src_stride; + dst += dst_stride; + } +} diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c index 07d8a169f6dbdd1332d1873ff27fe3d2c3c29f36..5e425895fddd58c673dce99c9ef7732d05e86570 100644 --- a/vp9/common/vp9_filter.c +++ b/vp9/common/vp9_filter.c @@ -15,23 +15,23 @@ #include "vp9_rtcd.h" #include "vp9/common/vp9_common.h" -DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = { - { 128, 0 }, - { 120, 8 }, - { 112, 16 }, - { 104, 24 }, - { 96, 32 }, - { 88, 40 }, - { 80, 48 }, - { 72, 56 }, - { 64, 64 }, - { 56, 72 }, - { 48, 80 }, - { 40, 88 }, - { 32, 96 }, - { 24, 104 }, - { 16, 112 }, - { 8, 120 } +DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, + { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, + { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, + { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, + { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, + { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, + { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, + { 0, 0, 0, 8, 120, 0, 0, 0 } }; #define FILTER_ALPHA 0 @@ -144,1072 +144,21 @@ DECLARE_ALIGNED(16, const int16_t, { 1, -2, -7, 37, 80, 28, -8, -1} }; -DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = { - {0, 0, 128, 0, 0, 0}, - {1, -5, 125, 8, -2, 1}, - {1, -8, 122, 17, -5, 1}, - {2, -11, 116, 27, -8, 2}, - {3, -14, 110, 37, -10, 2}, - {3, -15, 103, 47, -12, 2}, - {3, -16, 95, 57, -14, 3}, - {3, -16, 86, 67, -15, 3}, - {3, -16, 77, 77, -16, 3}, - {3, -15, 67, 86, -16, 3}, - {3, -14, 57, 95, -16, 3}, - {2, -12, 47, 103, -15, 3}, - {2, -10, 37, 110, -14, 3}, - {2, -8, 27, 116, -11, 2}, - {1, -5, 17, 122, -8, 1}, - {1, -2, 8, 125, -5, 1} +DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = { + {0, 0, 0, 128, 0, 0, 0, 0}, + {0, 1, -5, 125, 8, -2, 1, 0}, + {0, 1, -8, 122, 17, -5, 1, 0}, + {0, 2, -11, 116, 27, -8, 2, 0}, + {0, 3, -14, 110, 37, -10, 2, 0}, + {0, 3, -15, 103, 47, -12, 2, 0}, + {0, 3, -16, 95, 57, -14, 3, 0}, + {0, 3, -16, 86, 67, -15, 3, 0}, + {0, 3, -16, 77, 77, -16, 3, 0}, + {0, 3, -15, 67, 86, -16, 3, 0}, + {0, 3, -14, 57, 95, -16, 3, 0}, + {0, 2, -12, 47, 103, -15, 3, 0}, + {0, 2, -10, 37, 110, -14, 3, 0}, + {0, 2, -8, 27, 116, -11, 2, 0}, + {0, 1, -5, 17, 122, -8, 1, 0}, + {0, 1, -2, 8, 125, -5, 1, 0} }; - -static void filter_block2d_first_pass_6(uint8_t *src_ptr, - int *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_width; - } -} - -static void filter_block2d_second_pass_6(int *src_ptr, - uint8_t *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -/* - * The only functional difference between filter_block2d_second_pass() - * and this function is that filter_block2d_second_pass() does a sixtap - * filter on the input and stores it in the output. This function - * (filter_block2d_second_pass_avg()) does a sixtap filter on the input, - * and then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. - */ -static void filter_block2d_second_pass_avg_6(int *src_ptr, - uint8_t *output_ptr, - int output_pitch, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) + - ((int)src_ptr[0] * vp9_filter[2]) + - ((int)src_ptr[pixel_step] * vp9_filter[3]) + - ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) + - ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) + - (VP9_FILTER_WEIGHT >> 1); /* Rounding */ - - /* Normalize back to 0-255 */ - output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) + - output_ptr[j] + 1) >> 1; - src_ptr++; - } - - /* Start next row */ - src_ptr += src_pixels_per_line - output_width; - output_ptr += output_pitch; - } -} - -#define Interp_Extend 3 -static void filter_block2d_6(uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const int16_t *HFilter, - const int16_t *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - - -void vp9_sixtap_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, - VFilter); -} - -/* - * The difference between filter_block2d_6() and filter_block2d_avg_6 is - * that filter_block2d_6() does a 6-tap filter and stores it in the output - * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and - * then averages that with the content already present in the output - * ((filter_result + dest + 1) >> 1) and stores that in the output. - */ -static void filter_block2d_avg_6(uint8_t *src_ptr, - uint8_t *output_ptr, - unsigned int src_pixels_per_line, - int output_pitch, - const int16_t *HFilter, - const int16_t *VFilter) { - int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr, - output_pitch, 4, 4, 4, 4, VFilter); -} - -void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter); -} - -void vp9_sixtap_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 8, 8, VFilter); - -} - -void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 8, 8, VFilter); -} - -void vp9_sixtap_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 8, 8, 4, 8, VFilter); -} - -void vp9_sixtap_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 16, 16, 16, 16, VFilter); -} - -void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */ - - HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */ - VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */ - - /* First filter 1-D horizontally... */ - filter_block2d_first_pass_6( - src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData, - src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter); - - /* then filter vertically... */ - filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr, - dst_pitch, 16, 16, 16, 16, VFilter); -} - -typedef enum { - VPX_FILTER_4x4 = 0, - VPX_FILTER_8x8 = 1, - VPX_FILTER_8x4 = 2, - VPX_FILTER_16x16 = 3, -} filter_size_t; - -static const unsigned int filter_size_to_wh[][2] = { - {4, 4}, - {8, 8}, - {8, 4}, - {16,16}, -}; - -static void filter_block2d_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter, - const int16_t *VFilter, - const filter_size_t filter_size, - uint8_t *dst_ptr, - unsigned int dst_stride) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - // Between passes, we use an intermediate buffer whose height is extended to - // have enough horizontally filtered values as input for the vertical pass. - // This buffer is allocated to be big enough for the largest block type we - // support. - const int kInterp_Extend = 4; - const unsigned int intermediate_height = - (kInterp_Extend - 1) + output_height + kInterp_Extend; - - /* Size of intermediate_buffer is max_intermediate_height * filter_max_width, - * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height - * + kInterp_Extend - * = 3 + 16 + 4 - * = 23 - * and filter_max_width = 16 - */ - uint8_t intermediate_buffer[23 * 16]; - const int intermediate_next_stride = 1 - intermediate_height * output_width; - - // Horizontal pass (src -> transposed intermediate). - { - uint8_t *output_ptr = intermediate_buffer; - const int src_next_row_stride = src_stride - output_width; - unsigned int i, j; - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - for (i = 0; i < intermediate_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... - int temp = ((int)src_ptr[0] * HFilter[0]) + - ((int)src_ptr[1] * HFilter[1]) + - ((int)src_ptr[2] * HFilter[2]) + - ((int)src_ptr[3] * HFilter[3]) + - ((int)src_ptr[4] * HFilter[4]) + - ((int)src_ptr[5] * HFilter[5]) + - ((int)src_ptr[6] * HFilter[6]) + - ((int)src_ptr[7] * HFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr++; - output_ptr += intermediate_height; - } - src_ptr += src_next_row_stride; - output_ptr += intermediate_next_stride; - } - } - - // Vertical pass (transposed intermediate -> dst). - { - uint8_t *src_ptr = intermediate_buffer; - const int dst_next_row_stride = dst_stride - output_width; - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - // Apply filter... - int temp = ((int)src_ptr[0] * VFilter[0]) + - ((int)src_ptr[1] * VFilter[1]) + - ((int)src_ptr[2] * VFilter[2]) + - ((int)src_ptr[3] * VFilter[3]) + - ((int)src_ptr[4] * VFilter[4]) + - ((int)src_ptr[5] * VFilter[5]) + - ((int)src_ptr[6] * VFilter[6]) + - ((int)src_ptr[7] * VFilter[7]) + - (VP9_FILTER_WEIGHT >> 1); // Rounding - - // Normalize back to 0-255... - *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT); - src_ptr += intermediate_height; - } - src_ptr += intermediate_next_stride; - dst_ptr += dst_next_row_stride; - } - } -} - -void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_4x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x4, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_8x8, dst_ptr, dst_stride); -} - -void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr, - const unsigned int src_stride, - const int16_t *HFilter_aligned16, - const int16_t *VFilter_aligned16, - uint8_t *dst_ptr, - unsigned int dst_stride) { - filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16, - VPX_FILTER_16x16, dst_ptr, dst_stride); -} - -static void block2d_average_c(uint8_t *src, - unsigned int src_stride, - uint8_t *output_ptr, - unsigned int output_stride, - const filter_size_t filter_size) { - const unsigned int output_width = filter_size_to_wh[filter_size][0]; - const unsigned int output_height = filter_size_to_wh[filter_size][1]; - - unsigned int i, j; - for (i = 0; i < output_height; i++) { - for (j = 0; j < output_width; j++) { - output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1; - } - output_ptr += output_stride; - } -} - -#define block2d_average block2d_average_c - -void vp9_eighttap_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8[xoffset]; - VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8s[xoffset]; - VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_sub_pel_filters_8lp[xoffset]; - VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, - HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - -void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - uint8_t tmp[4 * 4]; - - vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 4); - block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4); -} - - -void vp9_eighttap_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - uint8_t tmp[8 * 8]; - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp, - 8); - block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8); -} - -void vp9_eighttap_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - dst_ptr, dst_pitch); -} - -void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16); - const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset]; - const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset]; - - vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter, - tmp, 16); - block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16); -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_first_pass - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_stride : Stride of source block. - * uint32_t height : Block height. - * uint32_t width : Block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : int32_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block - * in the horizontal direction to produce the filtered output - * block. Used to implement first-pass of 2-D separable filter. - * - * SPECIAL NOTES : Produces int32_t output to retain precision for next pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * - ****************************************************************************/ -static void filter_block2d_bil_first_pass(uint8_t *src_ptr, - uint16_t *dst_ptr, - unsigned int src_stride, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply bilinear filter */ - dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[1] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - src_ptr++; - } - - /* Next row... */ - src_ptr += src_stride - width; - dst_ptr += width; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil_second_pass - * - * INPUTS : int32_t *src_ptr : Pointer to source block. - * uint32_t dst_pitch : Destination block pitch. - * uint32_t height : Block height. - * uint32_t width : Block width. - * int32_t *vp9_filter : Array of 2 bi-linear filter taps. - * - * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block - * in the vertical direction to produce the filtered output - * block. Used to implement second-pass of 2-D separable filter. - * - * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass. - * Two filter taps should sum to VP9_FILTER_WEIGHT. - * - ****************************************************************************/ -static void filter_block2d_bil_second_pass(uint16_t *src_ptr, - uint8_t *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply filter */ - temp = ((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[width] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2); - dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT); - src_ptr++; - } - - /* Next row... */ - dst_ptr += dst_pitch; - } -} - -/* - * As before for filter_block2d_second_pass_avg(), the functional difference - * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg() - * is that filter_block2d_bil_second_pass() does a bilinear filter on input - * and stores the result in output; filter_block2d_bil_second_pass_avg(), - * instead, does a bilinear filter on input, averages the resulting value - * with the values already present in the output and stores the result of - * that back into the output ((filter_result + dest + 1) >> 1). - */ -static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr, - uint8_t *dst_ptr, - int dst_pitch, - unsigned int height, - unsigned int width, - const int16_t *vp9_filter) { - unsigned int i, j; - int temp; - - for (i = 0; i < height; i++) { - for (j = 0; j < width; j++) { - /* Apply filter */ - temp = (((int)src_ptr[0] * vp9_filter[0]) + - ((int)src_ptr[width] * vp9_filter[1]) + - (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT; - dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1); - src_ptr++; - } - - /* Next row... */ - dst_ptr += dst_pitch; - } -} - -/**************************************************************************** - * - * ROUTINE : filter_block2d_bil - * - * INPUTS : uint8_t *src_ptr : Pointer to source block. - * uint32_t src_pitch : Stride of source block. - * uint32_t dst_pitch : Stride of destination block. - * int32_t *HFilter : Array of 2 horizontal filter taps. - * int32_t *VFilter : Array of 2 vertical filter taps. - * int32_t Width : Block width - * int32_t Height : Block height - * - * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block. - * - * RETURNS : void - * - * FUNCTION : 2-D filters an input block by applying a 2-tap - * bi-linear filter horizontally followed by a 2-tap - * bi-linear filter vertically on the result. - * - * SPECIAL NOTES : The largest block size can be handled here is 16x16 - * - ****************************************************************************/ -static void filter_block2d_bil(uint8_t *src_ptr, - uint8_t *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const int16_t *HFilter, - const int16_t *VFilter, - int Width, - int Height) { - - uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... */ - filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - -static void filter_block2d_bil_avg(uint8_t *src_ptr, - uint8_t *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, - const int16_t *HFilter, - const int16_t *VFilter, - int Width, - int Height) { - uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter); - - /* then 1-D vertically... */ - filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter); -} - -void vp9_bilinear_predict4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 4, 4); -} - -void vp9_bilinear_predict8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8); - -} - -void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 8, 8); -} - -void vp9_bilinear_predict8x4_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4); - -} - -void vp9_bilinear_predict16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16); -} - -void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - uint8_t *dst_ptr, - int dst_pitch) { - const int16_t *HFilter; - const int16_t *VFilter; - - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; - - filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line, - dst_pitch, HFilter, VFilter, 16, 16); -} diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h index cd666578d3cd53a7cd1d062b8553338dbbf41b79..1ccfdaac25c57f45ebe02179cb1458ba6abf62a1 100644 --- a/vp9/common/vp9_filter.h +++ b/vp9/common/vp9_filter.h @@ -21,10 +21,17 @@ #define SUBPEL_SHIFTS 16 -extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2]; -extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]; +extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; +// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear +// filter kernel as a 2 tap filter. +#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \ + sizeof(vp9_bilinear_filters[0][0])) +#define BF_OFFSET (BF_LENGTH / 2 - 1) +#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET) + #endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 88f2ea9c183f002e143c732f57083fb185e75aa2..f2c889108133535ab5421abda3144790bb4b1b64 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr, uint8_t temp2[2 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 3, 16, HFilter); @@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr, uint8_t temp2[2 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 2, HFilter); diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 20de7b7f1d8b5f08c189d70384c5bcc92cb2c02b..d4435d872d94015992f283656f677f2107c0f004 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -8,66 +8,58 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" void vp9_setup_interp_filters(MACROBLOCKD *xd, INTERPOLATIONFILTERTYPE mcomp_filter_type, VP9_COMMON *cm) { + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + xd->subpix.predict[0][0][0] = vp9_convolve_copy; + xd->subpix.predict[0][0][1] = vp9_convolve_avg; + xd->subpix.predict[0][1][0] = vp9_convolve8_vert; + xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert; + xd->subpix.predict[1][0][0] = vp9_convolve8_horiz; + xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz; + xd->subpix.predict[1][1][0] = vp9_convolve8; + xd->subpix.predict[1][1][1] = vp9_convolve8_avg; + + xd->subpix.x_step_q4 = 16; + xd->subpix.y_step_q4 = 16; + switch (mcomp_filter_type) { + case EIGHTTAP: + case SWITCHABLE: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8; + break; + case EIGHTTAP_SMOOTH: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp; + break; + case EIGHTTAP_SHARP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s; + break; + case BILINEAR: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters; + break; #if CONFIG_ENABLE_6TAP - if (mcomp_filter_type == SIXTAP) { - xd->subpixel_predict4x4 = vp9_sixtap_predict4x4; - xd->subpixel_predict8x4 = vp9_sixtap_predict8x4; - xd->subpixel_predict8x8 = vp9_sixtap_predict8x8; - xd->subpixel_predict16x16 = vp9_sixtap_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16; - } else { + case SIXTAP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6; + break; #endif - if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16; - } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_smooth; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_smooth; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_smooth; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_smooth; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth; - } else if (mcomp_filter_type == EIGHTTAP_SHARP) { - xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_sharp; - xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_sharp; - xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_sharp; - xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_sharp; - xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp; - xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp; - xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c; - } else { - xd->subpixel_predict4x4 = vp9_bilinear_predict4x4; - xd->subpixel_predict8x4 = vp9_bilinear_predict8x4; - xd->subpixel_predict8x8 = vp9_bilinear_predict8x8; - xd->subpixel_predict16x16 = vp9_bilinear_predict16x16; - xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4; - xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8; - xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16; - } -#if CONFIG_ENABLE_6TAP } -#endif } -void vp9_copy_mem16x16_c(uint8_t *src, +void vp9_copy_mem16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -93,10 +85,10 @@ void vp9_copy_mem16x16_c(uint8_t *src, dst[15] = src[15]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; - ((uint32_t *)dst)[2] = ((uint32_t *)src)[2]; - ((uint32_t *)dst)[3] = ((uint32_t *)src)[3]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; + ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; + ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; #endif src += src_stride; @@ -104,25 +96,7 @@ void vp9_copy_mem16x16_c(uint8_t *src, } } -void vp9_avg_mem16x16_c(uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 16; r++) { - int n; - - for (n = 0; n < 16; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x8_c(uint8_t *src, +void vp9_copy_mem8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -139,33 +113,15 @@ void vp9_copy_mem8x8_c(uint8_t *src, dst[6] = src[6]; dst[7] = src[7]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; #endif src += src_stride; dst += dst_stride; } } -void vp9_avg_mem8x8_c(uint8_t *src, - int src_stride, - uint8_t *dst, - int dst_stride) { - int r; - - for (r = 0; r < 8; r++) { - int n; - - for (n = 0; n < 8; n++) { - dst[n] = (dst[n] + src[n] + 1) >> 1; - } - - src += src_stride; - dst += dst_stride; - } -} - -void vp9_copy_mem8x4_c(uint8_t *src, +void vp9_copy_mem8x4_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride) { @@ -182,16 +138,16 @@ void vp9_copy_mem8x4_c(uint8_t *src, dst[6] = src[6]; dst[7] = src[7]; #else - ((uint32_t *)dst)[0] = ((uint32_t *)src)[0]; - ((uint32_t *)dst)[1] = ((uint32_t *)src)[1]; + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; #endif src += src_stride; dst += dst_stride; } } -void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { - int r; +void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, + struct subpix_fn_table *subpix) { uint8_t *ptr_base; uint8_t *ptr; uint8_t *pred_ptr = d->predictor; @@ -199,30 +155,14 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { ptr_base = *(d->base_pre); mv.as_int = d->bmi.as_mv.first.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { -#if !(CONFIG_FAST_UNALIGNED) - pred_ptr[0] = ptr[0]; - pred_ptr[1] = ptr[1]; - pred_ptr[2] = ptr[2]; - pred_ptr[3] = ptr[3]; -#else - *(uint32_t *)pred_ptr = *(uint32_t *)ptr; -#endif - pred_ptr += pitch; - ptr += d->pre_stride; - } - } + subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0]( + ptr, d->pre_stride, pred_ptr, pitch, + subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4, + subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4, + 4, 4); } /* @@ -232,8 +172,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) { * predictor of the second reference frame / motion vector. */ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf) { - int r; + struct subpix_fn_table *subpix) { uint8_t *ptr_base; uint8_t *ptr; uint8_t *pred_ptr = d->predictor; @@ -241,26 +180,14 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, ptr_base = *(d->base_second_pre); mv.as_int = d->bmi.as_mv.second.as_int; + ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1, - pred_ptr, pitch); - } else { - ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride + - (mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) { - pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1; - pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1; - pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1; - pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1; - pred_ptr += pitch; - ptr += d->pre_stride; - } - } + subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1]( + ptr, d->pre_stride, pred_ptr, pitch, + subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4, + subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4, + 4, 4); } void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { @@ -274,12 +201,11 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } + xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0]( + ptr, d->pre_stride, pred_ptr, pitch, + xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 8, 8); } /* @@ -300,12 +226,11 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd, ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch); - } + xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1]( + ptr, d->pre_stride, pred_ptr, pitch, + xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 8, 8); } static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { @@ -319,12 +244,11 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) { ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride + (mv.as_mv.col >> 3); - if (mv.as_mv.row & 7 || mv.as_mv.col & 7) { - xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, - (mv.as_mv.row & 7) << 1, pred_ptr, pitch); - } else { - vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch); - } + xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0]( + ptr, d->pre_stride, pred_ptr, pitch, + xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 8, 4); } /*encoder only*/ @@ -411,13 +335,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) { if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) build_inter_predictors2b(xd, d0, 8); else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_inter_predictors_b(d1, 8, &xd->subpix); } if (xd->mode_info_context->mbmi.second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix); } } } @@ -475,14 +399,11 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3); - if ((ymv.as_mv.row | ymv.as_mv.col) & 7) { - xd->subpixel_predict16x16(ptr, pre_stride, - (ymv.as_mv.col & 7) << 1, - (ymv.as_mv.row & 7) << 1, - dst_y, dst_ystride); - } else { - vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } + xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0]( + ptr, pre_stride, dst_y, dst_ystride, + xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4, + 16, 16); } void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, @@ -523,15 +444,19 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd, uptr = xd->pre.u_buffer + offset; vptr = xd->pre.v_buffer + offset; - if (_o16x16mv.as_int & 0x000f000f) { - xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride); - xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15, - _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride); - } else { - vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); - } + xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)] + [!!(_o16x16mv.as_mv.row & 15)][0]( + uptr, pre_stride, dst_u, dst_uvstride, + xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4, + 8, 8); + + xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)] + [!!(_o16x16mv.as_mv.row & 15)][0]( + vptr, pre_stride, dst_v, dst_uvstride, + xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4, + 8, 8); } @@ -714,12 +639,11 @@ void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd, ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1, - (mv_row & 7) << 1, dst_y, dst_ystride); - } else { - vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride); - } + xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1]( + ptr, pre_stride, dst_y, dst_ystride, + xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4, + 16, 16); } void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, @@ -758,15 +682,17 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd, uptr = xd->second_pre.u_buffer + offset; vptr = xd->second_pre.v_buffer + offset; - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15, - omv_row & 15, dst_u, dst_uvstride); - xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15, - omv_row & 15, dst_v, dst_uvstride); - } else { - vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride); - vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride); - } + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1]( + uptr, pre_stride, dst_u, dst_uvstride, + xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4, + 8, 8); + + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1]( + vptr, pre_stride, dst_v, dst_uvstride, + xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4, + xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4, + 8, 8); } void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, @@ -835,13 +761,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) build_inter_predictors2b(xd, d0, 16); else { - vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(d0, 16, &xd->subpix); + vp9_build_inter_predictors_b(d1, 16, &xd->subpix); } if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix); + vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix); } } } @@ -853,13 +779,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) { if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int) build_inter_predictors2b(xd, d0, 8); else { - vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4); - vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_inter_predictors_b(d1, 8, &xd->subpix); } if (mbmi->second_ref_frame > 0) { - vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4); - vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix); + vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix); } } } diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 89868b95efcadda7411b92d0cc3287e87f3f6950..903bd2e86ddc35b783270b9e9a05e357f9c6e8bf 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -14,6 +14,8 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_onyxc_int.h" +struct subpix_fn_table; + extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd, uint8_t *dst_y, int dst_ystride, @@ -64,10 +66,10 @@ extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x, extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd); extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); + struct subpix_fn_table *sppf); extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch, - vp9_subpix_fn_t sppf); + struct subpix_fn_table *sppf); extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch); diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 762dd75c0ab8d6c8cf1de300afb33b233f6152b6..9698172b2a6facb2279ca3da437bff4d85ff7c17 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -23,21 +23,6 @@ EOF } forward_decls vp9_common_forward_decls -prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" -prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" -prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" -prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride" - -# At the very least, MSVC 2008 has compiler bug exhibited by this code; code -# compiles warning free but a dissassembly of generated code show bugs. To be -# on the safe side, only enabled when compiled with 'gcc'. -if [ "$CONFIG_GCC" = "yes" ]; then - specialize vp9_filter_block2d_4x4_8 sse4_1 sse2 -fi - specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2 - specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2 - specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2 - # # Dequant # @@ -86,27 +71,17 @@ specialize vp9_dequant_idct_add_uv_block_16x16 # # RECON # -prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem16x16 mmx sse2 dspr2 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 -prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem8x8 mmx dspr2 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 -prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" specialize vp9_copy_mem8x4 mmx -prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_avg_mem16x16 - -prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_avg_mem8x8 - -prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" -specialize vp9_copy_mem8x4 mmx dspr2 -vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2 - prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride" specialize vp9_recon_b @@ -287,111 +262,6 @@ specialize vp9_convolve8_avg_horiz prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8_avg_vert -prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16 - -prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8 - -prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16 - -prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8 - -prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4 - -prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4 - -prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4 - -prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16_sharp - -prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8_sharp - -prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16_sharp - -prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8_sharp - -prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4_sharp - -prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4_sharp - -prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4_sharp - -prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict16x16_smooth - -prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x8_smooth - -prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg16x16_smooth - -prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg8x8_smooth - -prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict_avg4x4_smooth - -prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict8x4_smooth - -prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_eighttap_predict4x4_smooth - -prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict16x16 - -prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict8x8 - -prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg16x16 - -prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg8x8 - -prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict8x4 - -prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict4x4 - -prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_sixtap_predict_avg4x4 - -prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict16x16 sse2 - -prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict8x8 sse2 - -prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg16x16 - -prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg8x8 - -prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict8x4 - -prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict4x4 - -prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch" -specialize vp9_bilinear_predict_avg4x4 - # # dct # diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h deleted file mode 100644 index dc4eadfb190c4650ca1beb965e53f4aeb92b96bd..0000000000000000000000000000000000000000 --- a/vp9/common/vp9_subpixel.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_COMMON_VP9_SUBPIXEL_H_ -#define VP9_COMMON_VP9_SUBPIXEL_H_ - -#define prototype_subpixel_predict(sym) \ - void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \ - uint8_t *dst, int dst_pitch) - -typedef prototype_subpixel_predict((*vp9_subpix_fn_t)); - -#endif // VP9_COMMON_VP9_SUBPIXEL_H_ diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f09e2d78be86786cbb07a3aeace47d73f02e1172..d233247b2bf4f0cb2f966ccf4b7f3a842320c4d4 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -11,88 +11,6 @@ #include "./vpx_config.h" #include "vpx_ports/mem.h" -#include "vp9/common/vp9_subpixel.h" - -extern const short vp9_six_tap_mmx[8][6 * 8]; - -extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr, - unsigned char *output_ptr, - int output_pitch, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr, - unsigned char *output_ptr, - int dst_ptich, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const short *vp9_filter); - -extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int output_height, - unsigned int output_width); - -extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_lin, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - -extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - int dst_pitch, - unsigned int output_height, - const short *vp9_filter); - /////////////////////////////////////////////////////////////////////////// // the mmx function that does the bilinear filtering and var calculation // // int one pass // @@ -115,487 +33,3 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { { 16, 16, 16, 16, 112, 112, 112, 112 }, { 8, 8, 8, 8, 120, 120, 120, 120 } }; - -#if HAVE_MMX -void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16); - const short *hfilter, *vfilter; - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 8, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch, - 8, 4, 4, 4, vfilter); -} - -void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, - fdata2 + 8, src_pixels_per_line, 1, 21, 32, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, - fdata2 + 12, src_pixels_per_line, 1, 21, 32, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch, - 32, 16, 16, 16, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch, - 32, 16, 16, 16, vfilter); -} - -void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 13, 16, - hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 13, 16, - hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 8, 8, vfilter); -} - -void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_mmx\n"); -#endif - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; - - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), - fdata2, src_pixels_per_line, 1, 9, 16, hfilter); - vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, - fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter); - - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, 8, vfilter); - vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch, - 16, 8, 4, 8, vfilter); -} -#endif - -#if HAVE_SSE2 -void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 21, 32, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 21, 32); - vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch, - 32, 16, 16, dst_pitch, vfilter); - } -} - -void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 13, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 8, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, vfilter); - } -} - -void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - /* Temp data bufffer used in filtering */ - DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256); - const short *hfilter, *vfilter; -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_sse2\n"); -#endif - - if (xoffset) { - if (yoffset) { - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2, - src_pixels_per_line, 1, 9, 16, hfilter); - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch, - 16, 8, 4, dst_pitch, vfilter); - } else { - /* First-pass only */ - hfilter = vp9_six_tap_mmx[xoffset]; - vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, hfilter); - } - } else { - /* Second-pass only */ - vfilter = vp9_six_tap_mmx[yoffset]; - vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, vfilter); - } -} -#endif - -#if HAVE_SSSE3 -extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - unsigned int vp9_filter_index); - -void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict16x16_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - fdata2, 16, 21, xoffset); - vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch, - 16, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 16, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 16, yoffset); - } -} - -void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x8_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 13, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset); - } else { - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 8, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 8, yoffset); - } -} - -void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict8x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 8, 9, xoffset); - vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset); - } else { - /* First-pass only */ - vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - /* Second-pass only */ - vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - unsigned char *dst_ptr, - int dst_pitch) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9); -#ifdef ANNOUNCE_FUNCTION - printf("vp9_sixtap_predict4x4_ssse3\n"); -#endif - - if (xoffset) { - if (yoffset) { - vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, fdata2, 4, 9, xoffset); - vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset); - } else { - vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, - dst_ptr, dst_pitch, 4, xoffset); - } - } else { - vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), - src_pixels_per_line, - dst_ptr, dst_pitch, 4, yoffset); - } -} - -void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 23, hfilter_aligned16); - vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, - 16, hfilter_aligned16); - } else { - vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 16, vfilter_aligned16); - } - } -} - -void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, - const unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - const short *filter); - -void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 15, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 8, vfilter_aligned16); - } - } -} - -void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr, - const unsigned int src_stride, - const short *hfilter_aligned16, - const short *vfilter_aligned16, - unsigned char *dst_ptr, - unsigned int dst_stride) { - if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) { - DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16); - - vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride, - fdata2, 16, 11, hfilter_aligned16); - vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4, - vfilter_aligned16); - } else { - if (hfilter_aligned16[3] != 128) { - vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4, - hfilter_aligned16); - } else { - vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride, - dst_ptr, dst_stride, 4, vfilter_aligned16); - } - } -} -#endif diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c deleted file mode 100644 index 8e02ac1975eb13597aedab4c70b33189c8fd7954..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_filter_sse2.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <emmintrin.h> // SSE2 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/emmintrin_compat.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, src_ptr, offset) \ - { \ - /* Do shifted load to achieve require shuffles through unpacking */ \ - const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \ - const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \ - const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \ - const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \ - const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \ - const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \ - const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \ - const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \ - /* Shit by 4 bytes through suffle to get additional shifted loads */ \ - const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \ - const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \ - const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \ - const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \ - const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \ - /* multiply accumulate them */ \ - const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB); - const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB); - const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC); - const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC); - // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73 - // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx - // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3); - const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3); - // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63 - // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73 - // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx - // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx - const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1); - const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3); - const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx - // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose2_2), - _MM_SHUFFLE(3, 2, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(1, 0, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose2_3), - _MM_SHUFFLE(3, 2, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - DECLARE_ALIGNED(16, unsigned char, temp[32]); - { - _mm_store_si128((__m128i *)temp, transpose3_0); - DO_FOUR_PIXELS(col0, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_1); - DO_FOUR_PIXELS(col1, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_2); - DO_FOUR_PIXELS(col2, temp, 0); - } - { - _mm_store_si128((__m128i *)temp, transpose3_3); - DO_FOUR_PIXELS(col3, temp, 0); - } - // transpose - { - __m128i T0 = _mm_unpacklo_epi32(col0, col1); - __m128i T1 = _mm_unpacklo_epi32(col2, col3); - __m128i T2 = _mm_unpackhi_epi32(col0, col1); - __m128i T3 = _mm_unpackhi_epi32(col2, col3); - col0 = _mm_unpacklo_epi64(T0, T1); - col1 = _mm_unpackhi_epi64(T0, T1); - col2 = _mm_unpacklo_epi64(T2, T3); - col3 = _mm_unpackhi_epi64(T2, T3); - } - // saturate to 8 bit - { - col0 = _mm_packs_epi32(col0, col0); - col0 = _mm_packus_epi16(col0, col0); - col1 = _mm_packs_epi32(col1, col1); - col1 = _mm_packus_epi16(col1, col1); - col2 = _mm_packs_epi32 (col2, col2); - col2 = _mm_packus_epi16(col2, col2); - col3 = _mm_packs_epi32 (col3, col3); - col3 = _mm_packus_epi16(col3, col3); - } - // store - { - *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse2 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c deleted file mode 100644 index 52c35b29689d85f0e33778feaee08139772e2b01..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_filter_sse4.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <assert.h> // for alignment checks -#include <smmintrin.h> // SSE4.1 -#include "vp9/common/vp9_filter.h" -#include "vpx_ports/mem.h" // for DECLARE_ALIGNED -#include "vp9_rtcd.h" - -// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is -// just a quick partial snapshot so that other can already use some -// speedup. -// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap -// filtering. -// TODO(cd): Reduce source size by using macros instead of current code -// duplication. -// TODO(cd): Add some comments, better variable naming. -// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum -// of positive above 128), or have higher precision filter -// coefficients. - -DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = { - 0x00, 0x01, - 0x01, 0x02, - 0x02, 0x03, - 0x03, 0x04, - 0x02, 0x03, - 0x03, 0x04, - 0x04, 0x05, - 0x05, 0x06, -}; -DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = { - 0x04, 0x05, - 0x05, 0x06, - 0x06, 0x07, - 0x07, 0x08, - 0x06, 0x07, - 0x07, 0x08, - 0x08, 0x09, - 0x09, 0x0A, -}; -DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = { - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, - VP9_FILTER_WEIGHT >> 1, -}; -DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = { - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 -}; - -// Creating a macro to do more than four pixels at once to hide instruction -// latency is actually slower :-( -#define DO_FOUR_PIXELS(result, offset) \ - { \ - /*load pixels*/ \ - __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \ - /* extract the ones used for first column */ \ - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \ - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \ - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \ - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \ - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \ - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \ - /* multiply accumulate them */ \ - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \ - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \ - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \ - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \ - __m128i mad0123 = _mm_add_epi32(mad01, mad23); \ - __m128i mad4567 = _mm_add_epi32(mad45, mad67); \ - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \ - mad_all = _mm_add_epi32(mad_all, rounding); \ - result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \ - } - -void vp9_filter_block2d_4x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - __m128i intermediateA, intermediateB, intermediateC; - - const int kInterp_Extend = 4; - - const __m128i zero = _mm_set1_epi16(0); - const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c); - const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c); - const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c); - const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c); - - // check alignment - assert(0 == ((long)HFilter_aligned16)%16); - assert(0 == ((long)VFilter_aligned16)%16); - - { - __m128i transpose3_0; - __m128i transpose3_1; - __m128i transpose3_2; - __m128i transpose3_3; - - // Horizontal pass (src -> intermediate). - { - const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3)); - src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1); - - { - __m128i mad_all0; - __m128i mad_all1; - __m128i mad_all2; - __m128i mad_all3; - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateA = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - DO_FOUR_PIXELS(mad_all3, 3*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all3); - intermediateB = _mm_packus_epi16(mad_all0, mad_all2); - // -- - src_ptr += src_stride*4; - // -- - DO_FOUR_PIXELS(mad_all0, 0*src_stride) - DO_FOUR_PIXELS(mad_all1, 1*src_stride) - DO_FOUR_PIXELS(mad_all2, 2*src_stride) - mad_all0 = _mm_packs_epi32(mad_all0, mad_all1); - mad_all2 = _mm_packs_epi32(mad_all2, mad_all2); - intermediateC = _mm_packus_epi16(mad_all0, mad_all2); - } - } - - // Transpose result (intermediate -> transpose3_x) - { - // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 - // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73 - // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx - const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose); - const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose); - const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 - // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx - const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1); - const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1); - // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 - transpose3_0 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(0, 0, 1, 0))); - transpose3_1 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(1, 1, 3, 2))); - transpose3_2 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(2, 2, 1, 0))); - transpose3_3 = _mm_castps_si128( - _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1), - _mm_castsi128_ps(transpose1_2), - _MM_SHUFFLE(3, 3, 3, 2))); - // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx - // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx - // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx - // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx - } - - // Vertical pass (transpose3_x -> dst). - { - const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16); - // get first two columns filter coefficients - __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i col0, col1, col2, col3; - { - //load pixels - __m128i src = transpose3_0; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col0 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_1; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col1 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_2; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col2 = _mm_packus_epi16(mad_all, mad_all); - } - { - //load pixels - __m128i src = transpose3_3; - // extract the ones used for first column - __m128i src0123 = _mm_shuffle_epi8(src, mask0123); - __m128i src4567 = _mm_shuffle_epi8(src, mask4567); - __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); - __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); - __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); - __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); - // multiply accumulate them - __m128i mad01 = _mm_madd_epi16(src01_16, fil01); - __m128i mad23 = _mm_madd_epi16(src23_16, fil23); - __m128i mad45 = _mm_madd_epi16(src45_16, fil45); - __m128i mad67 = _mm_madd_epi16(src67_16, fil67); - __m128i mad0123 = _mm_add_epi32(mad01, mad23); - __m128i mad4567 = _mm_add_epi32(mad45, mad67); - __m128i mad_all = _mm_add_epi32(mad0123, mad4567); - mad_all = _mm_add_epi32(mad_all, rounding); - mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); - mad_all = _mm_packs_epi32(mad_all, mad_all); - col3 = _mm_packus_epi16(mad_all, mad_all); - } - { - __m128i col01 = _mm_unpacklo_epi8(col0, col1); - __m128i col23 = _mm_unpacklo_epi8(col2, col3); - __m128i col0123 = _mm_unpacklo_epi16(col01, col23); - //TODO(cd): look into Ronald's comment: - // Future suggestion: I believe here, too, you can merge the - // packs_epi32() and pacus_epi16() for the 4 cols above, so that - // you get the data in a single register, and then use pshufb - // (shuffle_epi8()) instead of the unpacks here. Should be - // 2+3+2 instructions faster. - *((unsigned int *)&dst_ptr[dst_stride * 0]) = - _mm_extract_epi32(col0123, 0); - *((unsigned int *)&dst_ptr[dst_stride * 1]) = - _mm_extract_epi32(col0123, 1); - *((unsigned int *)&dst_ptr[dst_stride * 2]) = - _mm_extract_epi32(col0123, 2); - *((unsigned int *)&dst_ptr[dst_stride * 3]) = - _mm_extract_epi32(col0123, 3); - } - } - } -} - -void vp9_filter_block2d_8x4_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int j; - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j, dst_stride); - } -} - -void vp9_filter_block2d_8x8_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<8; i+=4) { - for (j=0; j<8; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} - -void vp9_filter_block2d_16x16_8_sse4_1 -( - const unsigned char *src_ptr, const unsigned int src_stride, - const short *HFilter_aligned16, const short *VFilter_aligned16, - unsigned char *dst_ptr, unsigned int dst_stride -) { - int i, j; - for (i=0; i<16; i+=4) { - for (j=0; j<16; j+=4) { - vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride, - HFilter_aligned16, VFilter_aligned16, - dst_ptr + j + i*dst_stride, dst_stride); - } - } -} diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm deleted file mode 100644 index c6d65e904379586afdb5a923af3cfc1110d578a0..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ /dev/null @@ -1,550 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ - -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.vp9_filter_block1d8_v8_ssse3_loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm0, krd - paddsw xmm4, xmm6 - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx - - movq [rdi], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - lea rbx, [rdx + rdx*4] - add rbx, rdx ;pitch * 6 - -.vp9_filter_block1d16_v8_ssse3_loop: - movq xmm0, [rsi] ;A - movq xmm1, [rsi + rdx] ;B - movq xmm2, [rsi + rdx * 2] ;C - movq xmm3, [rax + rdx * 2] ;D - movq xmm4, [rsi + rdx * 4] ;E - movq xmm5, [rax + rdx * 4] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - movq xmm6, [rsi + rbx] ;G - movq xmm7, [rax + rbx] ;H - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - punpcklbw xmm6, xmm7 ;G H - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm0, krd - paddsw xmm4, xmm6 - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movq [rdi], xmm0 - - movq xmm0, [rsi + 8] ;A - movq xmm1, [rsi + rdx + 8] ;B - movq xmm2, [rsi + rdx * 2 + 8] ;C - movq xmm3, [rax + rdx * 2 + 8] ;D - movq xmm4, [rsi + rdx * 4 + 8] ;E - movq xmm5, [rax + rdx * 4 + 8] ;F - - punpcklbw xmm0, xmm1 ;A B - punpcklbw xmm2, xmm3 ;C D - punpcklbw xmm4, xmm5 ;E F - - - movq xmm6, [rsi + rbx + 8] ;G - movq xmm7, [rax + rbx + 8] ;H - punpcklbw xmm6, xmm7 ;G H - - - pmaddubsw xmm0, k0k1 - pmaddubsw xmm2, k2k3 - pmaddubsw xmm4, k4k5 - pmaddubsw xmm6, k6k7 - - paddsw xmm0, xmm2 - paddsw xmm4, xmm6 - paddsw xmm0, krd - paddsw xmm0, xmm4 - - psraw xmm0, 7 - packuswb xmm0, xmm0 - - add rsi, rdx - add rax, rdx - - movq [rdi+8], xmm0 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE -sym(vp9_filter_block1d8_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 -; movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - -.filter_block1d8_h8_rowloop_ssse3: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 - punpcklqdq xmm0, xmm3 - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm4 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - movq [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d8_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE -sym(vp9_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] - - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm4, [rdx] ;load filters - movd xmm5, rcx - packsswb xmm4, xmm4 - pshuflw xmm0, xmm4, 0b ;k0_k1 - pshuflw xmm1, xmm4, 01010101b ;k2_k3 - pshuflw xmm2, xmm4, 10101010b ;k4_k5 - pshuflw xmm3, xmm4, 11111111b ;k6_k7 - - punpcklqdq xmm0, xmm0 - punpcklqdq xmm1, xmm1 - punpcklqdq xmm2, xmm2 - punpcklqdq xmm3, xmm3 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm1 - pshufd xmm5, xmm5, 0 - movdqa k4k5, xmm2 - movdqa k6k7, xmm3 - movdqa krd, xmm5 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rdx, dword ptr arg(3) ;output_pitch - movsxd rcx, dword ptr arg(4) ;output_height - -.filter_block1d16_h8_rowloop_ssse3: - movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 - -; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11 - movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 -;note: if we create a k0_k7 filter, we can save a pshufb -; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11 - punpcklqdq xmm0, xmm3 - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm0, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 - paddsw xmm0, xmm2 - paddsw xmm0, krd - psraw xmm0, 7 - packuswb xmm0, xmm0 - - - movq xmm3, [rsi + 5] -; movq xmm7, [rsi + 12] - movq xmm7, [rsi + 13] -;note: same as above -; punpcklbw xmm3, xmm7 - punpcklqdq xmm3, xmm7 - - movdqa xmm1, xmm3 - pshufb xmm3, [GLOBAL(shuf_t0t1)] - pmaddubsw xmm3, k0k1 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf_t2t3)] - pmaddubsw xmm1, k2k3 - - movdqa xmm4, xmm2 - pshufb xmm2, [GLOBAL(shuf_t4t5)] - pmaddubsw xmm2, k4k5 - - pshufb xmm4, [GLOBAL(shuf_t6t7)] - pmaddubsw xmm4, k6k7 - - paddsw xmm3, xmm1 - paddsw xmm3, xmm2 - paddsw xmm3, krd - paddsw xmm3, xmm4 - psraw xmm3, 7 - packuswb xmm3, xmm3 - punpcklqdq xmm0, xmm3 - - lea rsi, [rsi + rax] - movdqa [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -shuf_t0t1: - db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -align 16 -shuf_t2t3: - db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -align 16 -shuf_t4t5: - db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -align 16 -shuf_t6t7: - db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm deleted file mode 100644 index dee29b8fbb1d76f89a5ee3701d247c69d282b585..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_mmx.asm +++ /dev/null @@ -1,268 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - - -%define BLOCK_HEIGHT_WIDTH 4 -%define vp9_filter_weight 128 -%define VP9_FILTER_SHIFT 7 - - -;void vp9_filter_block1d_h6_mmx -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1d_h6_mmx) PRIVATE -sym(vp9_filter_block1d_h6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - - movq mm1, [rdx + 16] ; do both the negative taps first!!! - movq mm2, [rdx + 32] ; - movq mm6, [rdx + 48] ; - movq mm7, [rdx + 64] ; - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(5) ;output_width ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - -.nextrow: - movq mm3, [rsi-2] ; mm3 = p-2..p5 - movq mm4, mm3 ; mm4 = p-2..p5 - psrlq mm3, 8 ; mm3 = p-1..p5 - punpcklbw mm3, mm0 ; mm3 = p-1..p2 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - movq mm5, mm4 ; mm5 = p-2..p5 - punpckhbw mm4, mm0 ; mm5 = p2..p5 - pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - movq mm4, mm5 ; mm4 = p-2..p5; - psrlq mm5, 16 ; mm5 = p0..p5; - punpcklbw mm5, mm0 ; mm5 = p0..p3 - pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - movq mm5, mm4 ; mm5 = p-2..p5 - psrlq mm4, 24 ; mm4 = p1..p5 - punpcklbw mm4, mm0 ; mm4 = p1..p4 - pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - ; do outer positive taps - movd mm4, [rsi+3] - punpcklbw mm4, mm0 ; mm5 = p3..p6 - pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers - paddsw mm3, mm4 ; mm3 += mm5 - - punpcklbw mm5, mm0 ; mm5 = p-2..p1 - pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers - paddsw mm3, mm5 ; mm3 += mm5 - - paddsw mm3, [GLOBAL(rd)] ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and unpack to saturate - punpcklbw mm3, mm0 ; - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line - add rdi, rax; -%else - movsxd r8, dword ptr arg(2) ;src_pixels_per_line - add rdi, rax; - - add rsi, r8 ; next line -%endif - - dec rcx ; decrement count - jnz .nextrow ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1dc_v6_mmx -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int output_pitch, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -global sym(vp9_filter_block1dc_v6_mmx) PRIVATE -sym(vp9_filter_block1dc_v6_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movq mm5, [GLOBAL(rd)] - push rbx - mov rbx, arg(7) ;vp9_filter - movq mm1, [rbx + 16] ; do both the negative taps first!!! - movq mm2, [rbx + 32] ; - movq mm6, [rbx + 48] ; - movq mm7, [rbx + 64] ; - - movsxd rdx, dword ptr arg(3) ;pixels_per_line - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - sub rsi, rdx - sub rsi, rdx - movsxd rcx, DWORD PTR arg(5) ;output_height - movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch? - pxor mm0, mm0 ; mm0 = 00000000 - - -.nextrow_cv: - movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1 - pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers. - - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2 - pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0 - pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi] ; mm4 = p0..p3 = row -2 - pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - add rsi, rdx ; move source forward 1 line to avoid 3 * pitch - movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1 - pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3 - pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers. - paddsw mm3, mm4 ; mm3 += mm4 - - - paddsw mm3, mm5 ; mm3 += round value - psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 - packuswb mm3, mm0 ; pack and saturate - - movd [rdi],mm3 ; store the results in the destination - ; the subsequent iterations repeat 3 out of 4 of these reads. Since the - ; recon block should be in cache this shouldn't cost much. Its obviously - ; avoidable!!!. - lea rdi, [rdi+rax] ; - dec rcx ; decrement count - jnz .nextrow_cv ; next row - - pop rbx - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -rd: - times 4 dw 0x40 - -align 16 -global HIDDEN_DATA(sym(vp9_six_tap_mmx)) -sym(vp9_six_tap_mmx): - times 8 dw 0 - times 8 dw 0 - times 8 dw 128 - times 8 dw 0 - times 8 dw 0 - times 8 dw 0 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 123 - times 8 dw 12 - times 8 dw -1 - times 8 dw 0 - - times 8 dw 2 - times 8 dw -11 - times 8 dw 108 - times 8 dw 36 - times 8 dw -8 - times 8 dw 1 - - times 8 dw 0 - times 8 dw -9 - times 8 dw 93 - times 8 dw 50 - times 8 dw -6 - times 8 dw 0 - - times 8 dw 3 - times 8 dw -16 - times 8 dw 77 - times 8 dw 77 - times 8 dw -16 - times 8 dw 3 - - times 8 dw 0 - times 8 dw -6 - times 8 dw 50 - times 8 dw 93 - times 8 dw -9 - times 8 dw 0 - - times 8 dw 1 - times 8 dw -8 - times 8 dw 36 - times 8 dw 108 - times 8 dw -11 - times 8 dw 2 - - times 8 dw 0 - times 8 dw -1 - times 8 dw 12 - times 8 dw 123 - times 8 dw -6 - times 8 dw 0 - diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm deleted file mode 100644 index b0c4f12825cbd0e179caf55c7911693762b7eebe..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_sse2.asm +++ /dev/null @@ -1,1372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -global sym(vp9_filter_block1d8_h6_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -;*************************************************************************************/ -global sym(vp9_filter_block1d16_h6_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 7 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(6) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;output_width -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm4 - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - punpcklbw xmm4, xmm0 - - movdqa XMMWORD Ptr [rdi+16], xmm4 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(5) ;[output_width] -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_sse2 -;( -; short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; short * vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d8_v6_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_sse2_loop: - movdqa xmm1, XMMWORD PTR [rsi] - pmullw xmm1, [rax] - - movdqa xmm2, XMMWORD PTR [rsi + rdx] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] - pmullw xmm3, [rax + 32] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] - pmullw xmm5, [rax + 64] - - add rsi, rdx - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] - - pmullw xmm4, [rax + 48] - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] - - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_v6_sse2 -;( -; unsigned short *src_ptr, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int pixels_per_line, -; unsigned int pixel_step, -; unsigned int output_height, -; unsigned int output_width, -; const short *vp9_filter -;) -;/************************************************************************************ -; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The -; input pixel array has output_height rows. -;*************************************************************************************/ -global sym(vp9_filter_block1d16_v6_sse2) PRIVATE -sym(vp9_filter_block1d16_v6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rax, arg(7) ;vp9_filter - movsxd rdx, dword ptr arg(3) ;pixels_per_line - - mov rdi, arg(1) ;output_ptr - mov rsi, arg(0) ;src_ptr - - sub rsi, rdx - sub rsi, rdx - - movsxd rcx, DWORD PTR arg(5) ;[output_height] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(2) ; dst_ptich -%endif - -.vp9_filter_block1d16_v6_sse2_loop: -; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. - movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 - movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] - pmullw xmm1, [rax + 16] - pmullw xmm2, [rax + 16] - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm3, [rax + 64] - pmullw xmm4, [rax + 64] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm5, [rax + 32] - pmullw xmm6, [rax + 32] - - movdqa xmm7, XMMWORD PTR [rsi] ; line 1 - movdqa xmm0, XMMWORD PTR [rsi + 16] - pmullw xmm7, [rax] - pmullw xmm0, [rax] - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - paddsw xmm1, xmm7 - paddsw xmm2, xmm0 - - add rsi, rdx - - movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 - movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] - pmullw xmm3, [rax + 48] - pmullw xmm4, [rax + 48] - - movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 - movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] - pmullw xmm5, [rax + 80] - pmullw xmm6, [rax + 80] - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] - pxor xmm0, xmm0 ; clear xmm0 - - paddsw xmm1, xmm3 - paddsw xmm2, xmm4 - paddsw xmm1, xmm5 - paddsw xmm2, xmm6 - - paddsw xmm1, xmm7 - paddsw xmm2, xmm7 - - psraw xmm1, 7 - psraw xmm2, 7 - - packuswb xmm1, xmm2 ; pack and saturate - movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(2) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d16_v6_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d8_h6_only_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 - - movq QWORD PTR [rdi], xmm4 ; store the results in the destination - lea rsi, [rsi + rax] - -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - dec rcx - - jnz .filter_block1d8_h6_only_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d16_h6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; First-pass filter only when yoffset==0 -global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE -sym(vp9_filter_block1d16_h6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdx, arg(5) ;vp9_filter - mov rsi, arg(0) ;src_ptr - - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ;dst_ptich -%endif - - pxor xmm0, xmm0 ; clear xmm0 for unpack - -.filter_block1d16_h6_only_sse2_rowloop: - movq xmm3, MMWORD PTR [rsi - 2] - movq xmm1, MMWORD PTR [rsi + 6] - - movq xmm2, MMWORD PTR [rsi +14] - pslldq xmm2, 8 - - por xmm2, xmm1 - prefetcht2 [rsi+rax-2] - - pslldq xmm1, 8 - por xmm1, xmm3 - - movdqa xmm4, xmm1 - movdqa xmm5, xmm1 - - movdqa xmm6, xmm1 - movdqa xmm7, xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm1 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; lower 8 bytes - - movq QWORD Ptr [rdi], xmm4 ; store the results in the destination - - movdqa xmm3, xmm2 - movdqa xmm4, xmm2 - - movdqa xmm5, xmm2 - movdqa xmm6, xmm2 - - movdqa xmm7, xmm2 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 - - pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 - punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 - - psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 - pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 - - punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 - psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 - - pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 - - punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 - psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 - - pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 - - punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 - psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 - - pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 - - punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 - pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 - - paddsw xmm4, xmm7 - paddsw xmm4, xmm5 - - paddsw xmm4, xmm3 - paddsw xmm4, xmm6 - - paddsw xmm4, xmm2 - paddsw xmm4, [GLOBAL(rd)] - - psraw xmm4, 7 - - packuswb xmm4, xmm0 ; higher 8 bytes - - movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(3) ;dst_ptich -%else - add rdi, r8 -%endif - - dec rcx - jnz .filter_block1d16_h6_only_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_filter_block1d8_v6_only_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; int dst_ptich, -; unsigned int output_height, -; const short *vp9_filter -;) -; Second-pass filter only when xoffset==0 -global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE -sym(vp9_filter_block1d8_v6_only_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - mov rax, arg(5) ;vp9_filter - - pxor xmm0, xmm0 ; clear xmm0 - - movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(3) ; dst_ptich -%endif - -.vp9_filter_block1d8_v6_only_sse2_loop: - movq xmm1, MMWORD PTR [rsi] - movq xmm2, MMWORD PTR [rsi + rdx] - movq xmm3, MMWORD PTR [rsi + rdx * 2] - movq xmm5, MMWORD PTR [rsi + rdx * 4] - add rsi, rdx - movq xmm4, MMWORD PTR [rsi + rdx * 2] - movq xmm6, MMWORD PTR [rsi + rdx * 4] - - punpcklbw xmm1, xmm0 - pmullw xmm1, [rax] - - punpcklbw xmm2, xmm0 - pmullw xmm2, [rax + 16] - - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax + 32] - - punpcklbw xmm5, xmm0 - pmullw xmm5, [rax + 64] - - punpcklbw xmm4, xmm0 - pmullw xmm4, [rax + 48] - - punpcklbw xmm6, xmm0 - pmullw xmm6, [rax + 80] - - paddsw xmm2, xmm5 - paddsw xmm2, xmm3 - - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - - paddsw xmm2, xmm6 - paddsw xmm2, xmm7 - - psraw xmm2, 7 - packuswb xmm2, xmm0 ; pack and saturate - - movq QWORD PTR [rdi], xmm2 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[dst_ptich] -%else - add rdi, r8 -%endif - dec rcx ; decrement count - jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_unpack_block1d16_h6_sse2 -;( -; unsigned char *src_ptr, -; unsigned short *output_ptr, -; unsigned int src_pixels_per_line, -; unsigned int output_height, -; unsigned int output_width -;) -global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE -sym(vp9_unpack_block1d16_h6_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(1) ;output_ptr - - movsxd rcx, dword ptr arg(3) ;output_height - movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source - - pxor xmm0, xmm0 ; clear xmm0 for unpack -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source -%endif - -.unpack_block1d16_h6_sse2_rowloop: - movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2 - movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1 - - punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2 - punpcklbw xmm1, xmm0 - - movdqa XMMWORD Ptr [rdi], xmm1 - movdqa XMMWORD Ptr [rdi + 16], xmm3 - - lea rsi, [rsi + rax] -%if ABI_IS_32BIT - add rdi, DWORD Ptr arg(4) ;[output_width] -%else - add rdi, r8 -%endif - dec rcx - jnz .unpack_block1d16_h6_sse2_rowloop ; next row - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict16x16_sse2) PRIVATE -sym(vp9_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp9_bilinear_filters_mmx) -global sym(vp9_bilinear_predict8x8_sse2) PRIVATE -sym(vp9_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = bilinear_filters_mmx[xoffset] - ;const short *VFilter = bilinear_filters_mmx[yoffset] - lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -rd: - times 8 dw 0x40 diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm deleted file mode 100644 index b260480e0364eafc957ea561fcf294a6d22e165c..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_ssse3.asm +++ /dev/null @@ -1,1515 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define BLOCK_HEIGHT_WIDTH 4 -%define VP9_FILTER_WEIGHT 128 -%define VP9_FILTER_SHIFT 7 - - -;/************************************************************************************ -; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The -; input pixel array has output_height rows. This routine assumes that output_height is an -; even number. This function handles 8 pixels in horizontal direction, calculating ONE -; rows each iteration to take advantage of the 128 bits operations. -; -; This is an implementation of some of the SSE optimizations first seen in ffvp8 -; -;*************************************************************************************/ -;void vp9_filter_block1d8_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE -sym(vp9_filter_block1d8_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 - - movdqa xmm7, [GLOBAL(rd)] - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - mov rdi, arg(2) ;output_ptr - - cmp esi, DWORD PTR [rax] - je vp9_filter_block1d8_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx -;xmm3 free -.filter_block1d8_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - pmaddubsw xmm1, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm1 - paddsw xmm2, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - jnz .filter_block1d8_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -vp9_filter_block1d8_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)] - movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)] - - mov rsi, arg(0) ;src_ptr - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - - sub rdi, rdx - -.filter_block1d8_h4_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm2, xmm0 - pshufb xmm0, xmm3 - - pshufb xmm2, xmm4 - pmaddubsw xmm0, xmm5 - - lea rdi, [rdi + rdx] - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] - dec rcx - - paddsw xmm0, xmm7 - - paddsw xmm0, xmm2 - - psraw xmm0, 7 - - packuswb xmm0, xmm0 - - movq MMWORD Ptr [rdi], xmm0 - - jnz .filter_block1d8_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d16_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE -sym(vp9_filter_block1d16_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - mov rdi, arg(2) ;output_ptr - - mov rsi, arg(0) ;src_ptr - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d16_h6_rowloop_ssse3: - movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5 - - movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10 - - punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10 - - movdqa xmm1, xmm0 - pmaddubsw xmm0, xmm4 - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - movq xmm3, MMWORD PTR [rsi + 6] - - pmaddubsw xmm1, xmm5 - movq xmm7, MMWORD PTR [rsi + 11] - - pmaddubsw xmm2, xmm6 - punpcklbw xmm3, xmm7 - - paddsw xmm0, xmm1 - movdqa xmm1, xmm3 - - pmaddubsw xmm3, xmm4 - paddsw xmm0, xmm2 - - movdqa xmm2, xmm1 - paddsw xmm0, [GLOBAL(rd)] - - pshufb xmm1, [GLOBAL(shuf2bfrom1)] - pshufb xmm2, [GLOBAL(shuf3bfrom1)] - - psraw xmm0, 7 - pmaddubsw xmm1, xmm5 - - pmaddubsw xmm2, xmm6 - packuswb xmm0, xmm0 - - lea rsi, [rsi + rax] - paddsw xmm3, xmm1 - - paddsw xmm3, xmm2 - - paddsw xmm3, [GLOBAL(rd)] - - psraw xmm3, 7 - - packuswb xmm3, xmm3 - - punpcklqdq xmm0, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm0 - - lea rdi, [rdi + rdx] - dec rcx - jnz .filter_block1d16_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d4_h6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE -sym(vp9_filter_block1d4_h6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - movdqa xmm7, [GLOBAL(rd)] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_h4_ssse3 - - movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -;xmm3 free -.filter_block1d4_h6_rowloop_ssse3: - movdqu xmm0, XMMWORD PTR [rsi - 2] - - movdqa xmm1, xmm0 - pshufb xmm0, [GLOBAL(shuf1b)] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] - pmaddubsw xmm0, xmm4 - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm0, xmm1 - paddsw xmm0, xmm7 - pxor xmm1, xmm1 - paddsw xmm0, xmm2 - psraw xmm0, 7 - packuswb xmm0, xmm0 - - movd DWORD PTR [rdi], xmm0 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h6_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)] - movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)] - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - - movsxd rdx, dword ptr arg(3) ;output_pitch - -.filter_block1d4_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)] - pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - -;-- - pmaddubsw xmm2, xmm6 - - lea rsi, [rsi + rax] -;-- - paddsw xmm1, xmm7 - paddsw xmm1, xmm2 - psraw xmm1, 7 - packuswb xmm1, xmm1 - - movd DWORD PTR [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz .filter_block1d4_h4_rowloop_ssse3 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - -;void vp9_filter_block1d16_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE -sym(vp9_filter_block1d16_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d16_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - - -.vp9_filter_block1d16_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 ;store the results - - movq xmm1, MMWORD PTR [rsi + 8] ;A - movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, [GLOBAL(rd)] - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi+8], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d16_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr - -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ;out_pitch -%endif - mov rax, rsi - movsxd rcx, DWORD PTR arg(4) ;output_height - add rax, rdx - -.vp9_filter_block1d16_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B - movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E - - paddsw xmm2, [GLOBAL(rd)] - paddsw xmm2, xmm3 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - punpcklbw xmm5, xmm4 ;B D - punpcklbw xmm1, xmm0 ;C E - - pmaddubsw xmm1, xmm6 - pmaddubsw xmm5, xmm7 - - movdqa xmm4, [GLOBAL(rd)] - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm5, xmm1 - paddsw xmm5, xmm4 - psraw xmm5, 7 - packuswb xmm5, xmm5 - - punpcklqdq xmm2, xmm5 - - movdqa XMMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;out_pitch -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d16_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE -sym(vp9_filter_block1d8_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d8_v4_ssse3 - - movdqa xmm5, XMMWORD PTR [rax] ;k0_k5 - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v6_ssse3_loop: - movq xmm1, MMWORD PTR [rsi] ;A - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - movq xmm0, MMWORD PTR [rax + rdx * 4] ;F - movdqa xmm4, [GLOBAL(rd)] - - pmaddubsw xmm3, xmm6 - punpcklbw xmm1, xmm0 ;A F - pmaddubsw xmm2, xmm7 - pmaddubsw xmm1, xmm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm1 - paddsw xmm2, xmm4 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d8_v4_ssse3: - movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3 - movdqa xmm5, [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d8_v4_ssse3_loop: - movq xmm2, MMWORD PTR [rsi + rdx] ;B - movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C - movq xmm4, MMWORD PTR [rax + rdx * 2] ;D - movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E - - punpcklbw xmm2, xmm4 ;B D - punpcklbw xmm3, xmm0 ;C E - - pmaddubsw xmm3, xmm6 - pmaddubsw xmm2, xmm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw xmm2, xmm3 - paddsw xmm2, xmm5 - psraw xmm2, 7 - packuswb xmm2, xmm2 - - movq MMWORD PTR [rdi], xmm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d8_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret -;void vp9_filter_block1d4_v6_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; unsigned int vp9_filter_index -;) -global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE -sym(vp9_filter_block1d4_v6_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - movsxd rdx, DWORD PTR arg(5) ;table index - xor rsi, rsi - shl rdx, 4 ; - - lea rax, [GLOBAL(k0_k5)] - add rax, rdx - - movsxd rdx, DWORD PTR arg(1) ;pixels_per_line - mov rdi, arg(2) ;output_ptr -%if ABI_IS_32BIT=0 - movsxd r8, DWORD PTR arg(3) ; out_pitch -%endif - movsxd rcx, DWORD PTR arg(4) ;[output_height] - - cmp esi, DWORD PTR [rax] - je .vp9_filter_block1d4_v4_ssse3 - - movq mm5, MMWORD PTR [rax] ;k0_k5 - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v6_ssse3_loop: - movd mm1, DWORD PTR [rsi] ;A - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - movd mm0, DWORD PTR [rax + rdx * 4] ;F - - movq mm4, [GLOBAL(rd)] - - pmaddubsw mm3, mm6 - punpcklbw mm1, mm0 ;A F - pmaddubsw mm2, mm7 - pmaddubsw mm1, mm5 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm1 - paddsw mm2, mm4 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v6_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -.vp9_filter_block1d4_v4_ssse3: - movq mm6, MMWORD PTR [rax+256] ;k2_k4 - movq mm7, MMWORD PTR [rax+128] ;k1_k3 - movq mm5, MMWORD PTR [GLOBAL(rd)] - - mov rsi, arg(0) ;src_ptr - - mov rax, rsi - add rax, rdx - -.vp9_filter_block1d4_v4_ssse3_loop: - movd mm2, DWORD PTR [rsi + rdx] ;B - movd mm3, DWORD PTR [rsi + rdx * 2] ;C - movd mm4, DWORD PTR [rax + rdx * 2] ;D - movd mm0, DWORD PTR [rsi + rdx * 4] ;E - - punpcklbw mm2, mm4 ;B D - punpcklbw mm3, mm0 ;C E - - pmaddubsw mm3, mm6 - pmaddubsw mm2, mm7 - add rsi, rdx - add rax, rdx -;-- -;-- - paddsw mm2, mm3 - paddsw mm2, mm5 - psraw mm2, 7 - packuswb mm2, mm2 - - movd DWORD PTR [rdi], mm2 - -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(3) ;[out_pitch] -%else - add rdi, r8 -%endif - dec rcx - jnz .vp9_filter_block1d4_v4_ssse3_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict16x16_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE -sym(vp9_bilinear_predict16x16_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(2) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 4 - lea rax, [rax + rcx] ; HFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ; src_pixels_per_line - - movdqa xmm2, [rax] - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ; dst_pitch -%endif - movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14 - - punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16 - pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm6, xmm5 - movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15 - - movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16 - lea rsi, [rsi + rdx] ; next line - - pmaddubsw xmm6, xmm1 - - punpcklbw xmm4, xmm5 - pmaddubsw xmm4, xmm1 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value - psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128 - - packuswb xmm6, xmm4 - movdqa xmm5, xmm7 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm2 - - punpckhbw xmm7, xmm6 - pmaddubsw xmm7, xmm2 - - paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value - psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm5, xmm7 - movdqa xmm7, xmm6 - - movdqa [rdi], xmm5 ; store the results in the destination -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ; dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ; dst_ptr - mov rsi, arg(0) ; src_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm1, [rax] ; VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - - ; get the first horizontal line done - movq xmm4, [rsi] ; load row 0 - movq xmm2, [rsi + 8] ; load row 0 - - lea rsi, [rsi + rax] ; next line -.next_row_sp: - movq xmm3, [rsi] ; load row + 1 - movq xmm5, [rsi + 8] ; load row + 1 - - punpcklbw xmm4, xmm3 - punpcklbw xmm2, xmm5 - - pmaddubsw xmm4, xmm1 - movq xmm7, [rsi + rax] ; load row + 2 - - pmaddubsw xmm2, xmm1 - movq xmm6, [rsi + rax + 8] ; load row + 2 - - punpcklbw xmm3, xmm7 - punpcklbw xmm5, xmm6 - - pmaddubsw xmm3, xmm1 - paddw xmm4, [GLOBAL(rd)] - - pmaddubsw xmm5, xmm1 - paddw xmm2, [GLOBAL(rd)] - - psraw xmm4, VP9_FILTER_SHIFT - psraw xmm2, VP9_FILTER_SHIFT - - packuswb xmm4, xmm2 - paddw xmm3, [GLOBAL(rd)] - - movdqa [rdi], xmm4 ; store row 0 - paddw xmm5, [GLOBAL(rd)] - - psraw xmm3, VP9_FILTER_SHIFT - psraw xmm5, VP9_FILTER_SHIFT - - packuswb xmm3, xmm5 - movdqa xmm4, xmm7 - - movdqa [rdi + rdx],xmm3 ; store row 1 - lea rsi, [rsi + 2*rax] - - movdqa xmm2, xmm6 - lea rdi, [rdi + 2*rdx] - - cmp rdi, rcx - jne .next_row_sp - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ; src_pixels_per_line - -.next_row_fp: - movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07 - movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08 - - punpcklbw xmm2, xmm4 - movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15 - - pmaddubsw xmm2, xmm1 - movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16 - - lea rsi, [rsi + rax] ; next line - punpcklbw xmm3, xmm4 - - pmaddubsw xmm3, xmm1 - movq xmm5, [rsi] - - paddw xmm2, [GLOBAL(rd)] - movq xmm7, [rsi+1] - - movq xmm6, [rsi+8] - psraw xmm2, VP9_FILTER_SHIFT - - punpcklbw xmm5, xmm7 - movq xmm7, [rsi+9] - - paddw xmm3, [GLOBAL(rd)] - pmaddubsw xmm5, xmm1 - - psraw xmm3, VP9_FILTER_SHIFT - punpcklbw xmm6, xmm7 - - packuswb xmm2, xmm3 - pmaddubsw xmm6, xmm1 - - movdqa [rdi], xmm2 ; store the results in the destination - paddw xmm5, [GLOBAL(rd)] - - lea rdi, [rdi + rdx] ; dst_pitch - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm6, VP9_FILTER_SHIFT - - packuswb xmm5, xmm6 - lea rsi, [rsi + rax] ; next line - - movdqa [rdi], xmm5 ; store the results in the destination - lea rdi, [rdi + rdx] ; dst_pitch - - cmp rdi, rcx - - jne .next_row_fp - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_bilinear_predict8x8_ssse3 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE -sym(vp9_bilinear_predict8x8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ; xoffset - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .b8x8_sp_only - - shl rax, 4 - add rax, rcx ; HFilter - - mov rdi, arg(4) ; dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] - - movsxd rax, dword ptr arg(3) ; yoffset - cmp rax, 0 ; skip second_pass filter if yoffset=0 - je .b8x8_fp_only - - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm1, [rax] - - ; get the first horizontal line done - movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx - - psrldq xmm5, 1 - lea rsp, [rsp + 16] ; next line - - punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 - pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - -.next_row: - movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - lea rsp, [rsp + 16] ; next line - - movdqa xmm5, xmm6 - - psrldq xmm5, 1 - - punpcklbw xmm6, xmm5 - pmaddubsw xmm6, xmm0 - - paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value - psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128 - - packuswb xmm6, xmm6 - - punpcklbw xmm7, xmm6 - pmaddubsw xmm7, xmm1 - - paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value - psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128 - - packuswb xmm7, xmm7 - - movq [rdi], xmm7 ; store the results in the destination - lea rdi, [rdi + rdx] - - movdqa xmm7, xmm6 - - cmp rdi, rcx - jne .next_row - - jmp .done8x8 - -.b8x8_sp_only: - movsxd rax, dword ptr arg(3) ; yoffset - shl rax, 4 - lea rax, [rax + rcx] ; VFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ; dst_pitch - - movdqa xmm0, [rax] ; VFilter - - movq xmm1, XMMWORD PTR [rsp] - movq xmm2, XMMWORD PTR [rsp+16] - - movq xmm3, XMMWORD PTR [rsp+32] - punpcklbw xmm1, xmm2 - - movq xmm4, XMMWORD PTR [rsp+48] - punpcklbw xmm2, xmm3 - - movq xmm5, XMMWORD PTR [rsp+64] - punpcklbw xmm3, xmm4 - - movq xmm6, XMMWORD PTR [rsp+80] - punpcklbw xmm4, xmm5 - - movq xmm7, XMMWORD PTR [rsp+96] - punpcklbw xmm5, xmm6 - - pmaddubsw xmm1, xmm0 - pmaddubsw xmm2, xmm0 - - pmaddubsw xmm3, xmm0 - pmaddubsw xmm4, xmm0 - - pmaddubsw xmm5, xmm0 - punpcklbw xmm6, xmm7 - - pmaddubsw xmm6, xmm0 - paddw xmm1, [GLOBAL(rd)] - - paddw xmm2, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm2, VP9_FILTER_SHIFT - - paddw xmm4, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm4, VP9_FILTER_SHIFT - - paddw xmm6, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - psraw xmm6, VP9_FILTER_SHIFT - packuswb xmm1, xmm1 - - packuswb xmm2, xmm2 - movq [rdi], xmm1 - - packuswb xmm3, xmm3 - movq [rdi+rdx], xmm2 - - packuswb xmm4, xmm4 - movq xmm1, XMMWORD PTR [rsp+112] - - lea rdi, [rdi + 2*rdx] - movq xmm2, XMMWORD PTR [rsp+128] - - packuswb xmm5, xmm5 - movq [rdi], xmm3 - - packuswb xmm6, xmm6 - movq [rdi+rdx], xmm4 - - lea rdi, [rdi + 2*rdx] - punpcklbw xmm7, xmm1 - - movq [rdi], xmm5 - pmaddubsw xmm7, xmm0 - - movq [rdi+rdx], xmm6 - punpcklbw xmm1, xmm2 - - pmaddubsw xmm1, xmm0 - paddw xmm7, [GLOBAL(rd)] - - psraw xmm7, VP9_FILTER_SHIFT - paddw xmm1, [GLOBAL(rd)] - - psraw xmm1, VP9_FILTER_SHIFT - packuswb xmm7, xmm7 - - packuswb xmm1, xmm1 - lea rdi, [rdi + 2*rdx] - - movq [rdi], xmm7 - - movq [rdi+rdx], xmm1 - lea rsp, [rsp + 144] - - jmp .done8x8 - -.b8x8_fp_only: - lea rcx, [rdi+rdx*8] - -.next_row_fp: - movdqa xmm1, XMMWORD PTR [rsp] - movdqa xmm3, XMMWORD PTR [rsp+16] - - movdqa xmm2, xmm1 - movdqa xmm5, XMMWORD PTR [rsp+32] - - psrldq xmm2, 1 - movdqa xmm7, XMMWORD PTR [rsp+48] - - movdqa xmm4, xmm3 - psrldq xmm4, 1 - - movdqa xmm6, xmm5 - psrldq xmm6, 1 - - punpcklbw xmm1, xmm2 - pmaddubsw xmm1, xmm0 - - punpcklbw xmm3, xmm4 - pmaddubsw xmm3, xmm0 - - punpcklbw xmm5, xmm6 - pmaddubsw xmm5, xmm0 - - movdqa xmm2, xmm7 - psrldq xmm2, 1 - - punpcklbw xmm7, xmm2 - pmaddubsw xmm7, xmm0 - - paddw xmm1, [GLOBAL(rd)] - psraw xmm1, VP9_FILTER_SHIFT - - paddw xmm3, [GLOBAL(rd)] - psraw xmm3, VP9_FILTER_SHIFT - - paddw xmm5, [GLOBAL(rd)] - psraw xmm5, VP9_FILTER_SHIFT - - paddw xmm7, [GLOBAL(rd)] - psraw xmm7, VP9_FILTER_SHIFT - - packuswb xmm1, xmm1 - packuswb xmm3, xmm3 - - packuswb xmm5, xmm5 - movq [rdi], xmm1 - - packuswb xmm7, xmm7 - movq [rdi+rdx], xmm3 - - lea rdi, [rdi + 2*rdx] - movq [rdi], xmm5 - - lea rsp, [rsp + 4*16] - movq [rdi+rdx], xmm7 - - lea rdi, [rdi + 2*rdx] - cmp rdi, rcx - - jne .next_row_fp - - lea rsp, [rsp + 16] - -.done8x8: - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -shuf1b: - db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 -shuf2b: - db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 -shuf3b: - db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 - -align 16 -shuf2bfrom1: - db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 -align 16 -shuf3bfrom1: - db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 - -align 16 -rd: - times 8 dw 0x40 - -align 16 -k0_k5: - times 8 db 0, 0 ;placeholder - times 8 db 0, 0 - times 8 db 2, 1 - times 8 db 0, 0 - times 8 db 3, 3 - times 8 db 0, 0 - times 8 db 1, 2 - times 8 db 0, 0 -k1_k3: - times 8 db 0, 0 ;placeholder - times 8 db -6, 12 - times 8 db -11, 36 - times 8 db -9, 50 - times 8 db -16, 77 - times 8 db -6, 93 - times 8 db -8, 108 - times 8 db -1, 123 -k2_k4: - times 8 db 128, 0 ;placeholder - times 8 db 123, -1 - times 8 db 108, -8 - times 8 db 93, -6 - times 8 db 77, -16 - times 8 db 50, -9 - times 8 db 36, -11 - times 8 db 12, -6 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 - diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h deleted file mode 100644 index 25bc26d9bddc50a69d8288c2466dc21f8649de24..0000000000000000000000000000000000000000 --- a/vp9/common/x86/vp9_subpixel_x86.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ -#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ - -/* Note: - * - * This platform is commonly built for runtime CPU detection. If you modify - * any of the function mappings present in this file, be sure to also update - * them in the function pointer initialization code - */ - -#if HAVE_MMX -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx - -#endif -#endif - - -#if HAVE_SSE2 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2); - - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2 - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2 - -#endif -#endif - -#if HAVE_SSSE3 -extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3); -extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3); -extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3); - -#if !CONFIG_RUNTIME_CPU_DETECT -#undef vp9_subpix_sixtap16x16 -#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3 - -#undef vp9_subpix_sixtap8x8 -#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3 - -#undef vp9_subpix_sixtap8x4 -#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3 - -#undef vp9_subpix_sixtap4x4 -#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3 - - -#undef vp9_subpix_bilinear16x16 -#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3 - -#undef vp9_subpix_bilinear8x8 -#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3 - -#endif -#endif - - - -#endif diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 4a0794a9b24b9c48b52b273499bc1c325b41914f..1bbf95468f52b31161e7156f7957de1493c08421 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -11,6 +11,7 @@ #include "vpx_config.h" #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_onyx_int.h" #include "vp9/common/vp9_systemdependent.h" #include "vp9/encoder/vp9_quantize.h" @@ -3953,6 +3954,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, cm->fb_idx_ref_cnt[cm->new_fb_idx]--; cm->new_fb_idx = get_free_fb(cm); + vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm); if (cpi->pass == 1) { Pass1Encode(cpi, size, dest, frame_flags); } else if (cpi->pass == 2) { diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 2e9bbcfc1e25db82fcf94e188dc7fe8d4803cfa2..200a6a947337a0ac2836ec053a3fd28330ade53e 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2222,9 +2222,9 @@ static int64_t encode_inter_mb_segment(MACROBLOCK *x, BLOCK *be = &x->block[i]; int thisdistortion; - vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4); + vp9_build_inter_predictors_b(bd, 16, &xd->subpix); if (xd->mode_info_context->mbmi.second_ref_frame > 0) - vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4); + vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix); vp9_subtract_b(be, bd, 16); x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4(be, bd); diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8bbe534860c99df912bbc75f14329cccf6d3e1ce..7bca01e0514a93374dcb524026146215c5733541 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -50,12 +50,11 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, // Y yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3); - if ((mv_row | mv_col) & 7) { - xd->subpixel_predict16x16(yptr, stride, - (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16); - } else { - vp9_copy_mem16x16(yptr, stride, &pred[0], 16); - } + xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][0]( + yptr, stride, &pred[0], 16, + xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4, + xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4, + 16, 16); // U & V omv_row = mv_row; @@ -67,15 +66,17 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, uptr = u_mb_ptr + offset; vptr = v_mb_ptr + offset; - if ((omv_row | omv_col) & 15) { - xd->subpixel_predict8x8(uptr, stride, - (omv_col & 15), (omv_row & 15), &pred[256], 8); - xd->subpixel_predict8x8(vptr, stride, - (omv_col & 15), (omv_row & 15), &pred[320], 8); - } else { - vp9_copy_mem8x8(uptr, stride, &pred[256], 8); - vp9_copy_mem8x8(vptr, stride, &pred[320], 8); - } + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0]( + uptr, stride, &pred[256], 8, + xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4, + xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4, + 8, 8); + + xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0]( + vptr, stride, &pred[320], 8, + xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4, + xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4, + 8, 8); } void vp9_temporal_filter_apply_c(uint8_t *frame1, diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index d03e285c6340f43a12dcf7fb2bca23a96b4db28e..d07a65b4551c8171c4946ead30d7c5c375fa3e97 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -142,8 +142,8 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, const int16_t *HFilter, *VFilter; uint16_t FData3[5 * 4]; // Temp data bufffer used in filtering - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); // First filter 1d Horizontal var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter); @@ -166,8 +166,8 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter); @@ -186,8 +186,8 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter); @@ -206,8 +206,8 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, uint8_t temp2[68 * 64]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 65, 64, HFilter); @@ -227,8 +227,8 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, uint8_t temp2[36 * 32]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter); @@ -367,8 +367,8 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter); var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter); @@ -387,8 +387,8 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, uint8_t temp2[20 * 16]; const int16_t *HFilter, *VFilter; - HFilter = vp9_bilinear_filters[xoffset]; - VFilter = vp9_bilinear_filters[yoffset]; + HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index d1805be62b069d298d99116ff3cc664862037a42..c7e8acb454dbd38d2f918cd2cc86d6471e765544 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -56,7 +56,6 @@ VP9_COMMON_SRCS-yes += common/vp9_subpelvar.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.h VP9_COMMON_SRCS-yes += common/vp9_seg_common.c VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h -VP9_COMMON_SRCS-yes += common/vp9_subpixel.h VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h VP9_COMMON_SRCS-yes += common/vp9_textblit.h @@ -81,7 +80,6 @@ VP9_COMMON_SRCS-yes += common/vp9_treecoder.c VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c @@ -90,7 +88,6 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm -VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm @@ -98,10 +95,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm @@ -113,19 +107,10 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm endif -VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c -ifeq ($(HAVE_SSE4_1),yes) -vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4 -vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4 -endif - -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c ifeq ($(HAVE_SSE2),yes) -vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2 -vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2 endif