• Yunqing Wang's avatar
    Preload reference area to an intermediate buffer in sub-pixel motion search · 20bd1446
    Yunqing Wang authored
    In sub-pixel motion search, the search range is small(+/- 3 pixels).
    Preload whole search area from reference buffer into a 32-byte
    aligned buffer. Then in search, load reference data from this buffer
    instead. This keeps data in cache, and reduces the crossing cache-
    line penalty. For tulip clip, tests on Intel Core2 Quad machine(linux)
    showed encoder speed improvement:
      3.4%   at --rt --cpu-used =-4
      2.8%   at --rt --cpu-used =-3
      2.3%   at --rt --cpu-used =-2
      2.2%   at --rt --cpu-used =-1
    
    Test on Atom notebook showed only 1.1% speed improvement(speed=-4).
    Test on Xeon machine also showed less improvement, since unaligned
    data access latency is greatly reduced in newer cores.
    
    Next, I will apply similar idea to other 2 sub-pixel search functions
    for encoding speed > 4.
    
    Make this change exclusively for x86 platforms.
    
    Change-Id: Ia7bb9f56169eac0f01009fe2b2f2ab5b61d2eb2f
    20bd1446
sad_c.c 13.89 KiB
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <stdlib.h>
#include "vpx_ports/config.h"
#include "vpx/vpx_integer.h"
unsigned int vp8_sad16x16_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    int max_sad)
    int r, c;
    unsigned int sad = 0;
    for (r = 0; r < 16; r++)
        for (c = 0; c < 16; c++)
            sad += abs(src_ptr[c] - ref_ptr[c]);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    return sad;
static __inline
unsigned int sad_mx_n_c(
    const unsigned char *src_ptr,
    int  src_stride,
    const unsigned char *ref_ptr,
    int  ref_stride,
    int m,
    int n)
    int r, c;
    unsigned int sad = 0;
    for (r = 0; r < n; r++)
        for (c = 0; c < m; c++)
            sad += abs(src_ptr[c] - ref_ptr[c]);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
    return sad;
unsigned int vp8_sad8x8_c(
7172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8); } unsigned int vp8_sad16x8_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8); } unsigned int vp8_sad8x16_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16); } unsigned int vp8_sad4x4_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad) { return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4); } void vp8_sad16x16x3_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } void vp8_sad16x16x8_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array ) { sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); } void vp8_sad16x8x3_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } void vp8_sad16x8x8_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array ) { sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); } void vp8_sad8x8x3_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } void vp8_sad8x8x8_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array ) { sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); }
211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
void vp8_sad8x16x3_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } void vp8_sad8x16x8_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array ) { sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); } void vp8_sad4x4x3_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } void vp8_sad4x4x8_c( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride, unsigned short *sad_array ) { sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); } void vp8_sad16x16x4d_c( const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array ) {
281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } void vp8_sad16x8x4d_c( const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } void vp8_sad8x8x4d_c( const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } void vp8_sad8x16x4d_c( const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } void vp8_sad4x4x4d_c( const unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr[], int ref_stride, unsigned int *sad_array ) { sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff); sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff); sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff); sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff); } /* Copy 2 macroblocks to a buffer */ void vp8_copy32xn_c( unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height) {
351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
int r; for (r = 0; r < height; r++) { #if !(CONFIG_FAST_UNALIGNED) dst_ptr[0] = src_ptr[0]; dst_ptr[1] = src_ptr[1]; dst_ptr[2] = src_ptr[2]; dst_ptr[3] = src_ptr[3]; dst_ptr[4] = src_ptr[4]; dst_ptr[5] = src_ptr[5]; dst_ptr[6] = src_ptr[6]; dst_ptr[7] = src_ptr[7]; dst_ptr[8] = src_ptr[8]; dst_ptr[9] = src_ptr[9]; dst_ptr[10] = src_ptr[10]; dst_ptr[11] = src_ptr[11]; dst_ptr[12] = src_ptr[12]; dst_ptr[13] = src_ptr[13]; dst_ptr[14] = src_ptr[14]; dst_ptr[15] = src_ptr[15]; dst_ptr[16] = src_ptr[16]; dst_ptr[17] = src_ptr[17]; dst_ptr[18] = src_ptr[18]; dst_ptr[19] = src_ptr[19]; dst_ptr[20] = src_ptr[20]; dst_ptr[21] = src_ptr[21]; dst_ptr[22] = src_ptr[22]; dst_ptr[23] = src_ptr[23]; dst_ptr[24] = src_ptr[24]; dst_ptr[25] = src_ptr[25]; dst_ptr[26] = src_ptr[26]; dst_ptr[27] = src_ptr[27]; dst_ptr[28] = src_ptr[28]; dst_ptr[29] = src_ptr[29]; dst_ptr[30] = src_ptr[30]; dst_ptr[31] = src_ptr[31]; #else ((uint32_t *)dst_ptr)[0] = ((uint32_t *)src_ptr)[0] ; ((uint32_t *)dst_ptr)[1] = ((uint32_t *)src_ptr)[1] ; ((uint32_t *)dst_ptr)[2] = ((uint32_t *)src_ptr)[2] ; ((uint32_t *)dst_ptr)[3] = ((uint32_t *)src_ptr)[3] ; ((uint32_t *)dst_ptr)[4] = ((uint32_t *)src_ptr)[4] ; ((uint32_t *)dst_ptr)[5] = ((uint32_t *)src_ptr)[5] ; ((uint32_t *)dst_ptr)[6] = ((uint32_t *)src_ptr)[6] ; ((uint32_t *)dst_ptr)[7] = ((uint32_t *)src_ptr)[7] ; #endif src_ptr += src_stride; dst_ptr += dst_stride; } }