;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;
; Processes two 16-byte rows and accumulates three sums of absolute
; differences (SAD) of the source row against the reference row read at
; byte offsets +0, +1 and +2.
;
; Parameter:
;   first - non-zero: the SADs of the first row initialise xmm5/xmm6/xmm7;
;           zero:     the SADs of the first row are added to xmm5/xmm6/xmm7.
;           The second row is always added.
;
; In:     rsi = source row (read with movdqa, so it must be 16-byte aligned)
;         rdi = reference row (read with lddqu, any alignment)
;         rax = source stride in bytes
;         rdx = reference stride in bytes
; Out:    xmm5/xmm6/xmm7 = running SADs for reference offsets +0/+1/+2;
;         each 64-bit half holds the partial sum produced by psadbw
;         rsi, rdi advanced by two rows
; Clobb:  xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        ; first row, accumulators are written directly
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; first row, added to the existing accumulators
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        ; step both pointers two rows down for the next invocation
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, offset
;
; Same job as PROCESS_16X2X3 (two rows, three SADs at reference offsets
; +0/+1/+2), but the reference row is fetched as two aligned 16-byte loads
; from [rdi] and [rdi+16], and the three 16-byte windows starting at byte
; offsets offset, offset+1 and offset+2 of that 32-byte span are extracted
; with palignr.
;
; Parameters:
;   first  - non-zero: the SADs of the first row initialise xmm5/xmm6/xmm7;
;            zero:     they are added to xmm5/xmm6/xmm7.
;            The second row is always added.
;   offset - byte offset of the wanted reference data from rdi; used as
;            the palignr immediate (offset, offset+1, offset+2). The
;            callers in this file pass 0..14.
;
; In:     rsi = source row (movdqa, must be 16-byte aligned)
;         rdi = reference row base (movdqa, must be 16-byte aligned)
;         rax = source stride in bytes
;         rdx = reference stride in bytes
; Out:    xmm5/xmm6/xmm7 = running SADs for the three windows
;         rsi, rdi advanced by two rows
; Clobb:  xmm0-xmm4
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        ; first row, accumulators are written directly
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; first row, added to the existing accumulators
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        ; step both pointers two rows down for the next invocation
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, prefix
;
; Emits the label <prefix>_aligned_by_<offset> followed by the code that
; handles sixteen rows for a reference pointer sitting <offset> bytes past
; a 16-byte boundary: rdi is moved back by <offset>, eight two-row steps
; of PROCESS_16X2X3_OFFSET are run (the first one initialises the
; accumulators), and control jumps to <prefix>_store_off.
;-----------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        ; rows 0-1 set up xmm5/xmm6/xmm7
        PROCESS_16X2X3_OFFSET 1, %1

        ; rows 2-15 accumulate, two rows per step
%rep 7
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, prefix
;
; Emits the label <prefix>_aligned_by_<offset> followed by the code that
; handles eight rows for a reference pointer sitting <offset> bytes past
; a 16-byte boundary: rdi is moved back by <offset>, four two-row steps
; of PROCESS_16X2X3_OFFSET are run (the first one initialises the
; accumulators), and control jumps to <prefix>_store_off.
;-----------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        ; rows 0-1 set up xmm5/xmm6/xmm7
        PROCESS_16X2X3_OFFSET 1, %1

        ; rows 2-7 accumulate, two rows per step
%rep 3
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------
;void vpx_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 sums of absolute differences between the source
; block and the reference block read at ref_ptr, ref_ptr+1 and ref_ptr+2,
; and stores them as 32-bit values in results[0], results[1], results[2].
;
; The source rows are read with movdqa (see PROCESS_16X2X3 and
; PROCESS_16X2X3_OFFSET), so every source row address must be 16-byte
; aligned. The reference pointer may have any alignment: its low four
; bits select one of sixteen code paths through a jump table.
;
; Register use after dispatch:
;   rsi = source row, rdi = reference row,
;   rax = src_stride, rdx = ref_stride (both sign-extended),
;   xmm5/xmm6/xmm7 = accumulators for reference offsets +0/+1/+2.
;-----------------------------------------------------------------------
global sym(vpx_sad16x16x3_ssse3) PRIVATE
sym(vpx_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15, the index into the jump table
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table holds 32-bit offsets of each handler relative to the
        ; do_jump label; it lives in the code stream, so jump over it.
        jmp .vpx_sad16x16x3_ssse3_skiptable
.vpx_sad16x16x3_ssse3_jumptable:
        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_skiptable:

        ; call/pop yields the run-time address of do_jump without any
        ; absolute relocation
        call .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = address of the selected handler

        ; rax/rdx are free again: load the strides the row macros expect
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Handlers for reference offsets 0..14: aligned loads + palignr.
        ; Each one ends with a jump to the store_off label.
        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3

        ; Offset 15 uses unaligned loads instead: the +2 window would need
        ; bytes beyond the two aligned chunks the palignr path loads.
        ; This path falls through into store_off.
.vpx_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vpx_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; Each accumulator holds two partial sums, one per 64-bit half.
        ; Add the halves and store the low 32 bits.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vpx_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x8 sums of absolute differences between the source
; block and the reference block read at ref_ptr, ref_ptr+1 and ref_ptr+2,
; and stores them as 32-bit values in results[0], results[1], results[2].
;
; The source rows are read with movdqa (see PROCESS_16X2X3 and
; PROCESS_16X2X3_OFFSET), so every source row address must be 16-byte
; aligned. The reference pointer may have any alignment: its low four
; bits select one of sixteen code paths through a jump table.
;
; Register use after dispatch:
;   rsi = source row, rdi = reference row,
;   rax = src_stride, rdx = ref_stride (both sign-extended),
;   xmm5/xmm6/xmm7 = accumulators for reference offsets +0/+1/+2.
;-----------------------------------------------------------------------
global sym(vpx_sad16x8x3_ssse3) PRIVATE
sym(vpx_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 15, the index into the jump table
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table holds 32-bit offsets of each handler relative to the
        ; do_jump label; it lives in the code stream, so jump over it.
        jmp .vpx_sad16x8x3_ssse3_skiptable
.vpx_sad16x8x3_ssse3_jumptable:
        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_skiptable:

        ; call/pop yields the run-time address of do_jump without any
        ; absolute relocation
        call .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = address of the selected handler

        ; rax/rdx are free again: load the strides the row macros expect
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Handlers for reference offsets 0..14: aligned loads + palignr.
        ; Each one ends with a jump to the store_off label.
        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3

        ; Offset 15 uses unaligned loads instead: the +2 window would need
        ; bytes beyond the two aligned chunks the palignr path loads.
        ; This path falls through into store_off.
.vpx_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vpx_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; Each accumulator holds two partial sums, one per 64-bit half.
        ; Add the halves and store the low 32 bits.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret