;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;------------------------------------------------------------------------------
;int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
;               short *qcoeff_ptr,short *dequant_ptr,
;               const int *default_zig_zag, short *round_ptr,
;               short *quant_ptr, short *dqcoeff_ptr,
;               unsigned short zbin_oq_value,
;               short *zbin_boost_ptr);
;
; Quantizes one block of 16 coefficients (two 8 x 16-bit vectors).
;
;   SIMD pass, all 16 positions at once:
;     x              = abs(coeff)
;     abs_minus_zbin = x - (zbin + zbin_oq_value)
;     temp_qcoeff    = sign(coeff) applied to (((x + round) * quant) >> 16)
;     qcoeff         = 0
;
;   Scalar pass, for i = 0..15 with rc = default_zig_zag[i]:
;     if abs_minus_zbin[rc] >= *zbin_boost_ptr and temp_qcoeff[rc] != 0
;         qcoeff[rc] = temp_qcoeff[rc], eob = i,
;         zbin_boost_ptr is rewound to its start
;     zbin_boost_ptr otherwise advances by one entry per position.
;
;   Final SIMD pass: dqcoeff = qcoeff * dequant (low 16 bits of each product).
;
; Returns (rax): eob + 1, i.e. 0 when no coefficient was stored.
;
; Every 16-byte access uses movdqa, so coeff_ptr, zbin_ptr, round_ptr,
; quant_ptr, qcoeff_ptr, dequant_ptr and dqcoeff_ptr must be 16-byte aligned.
; xmm6 and xmm7 are saved on entry and restored before returning.
;------------------------------------------------------------------------------
global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 10
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; NOTE(review): ALIGN_STACK comes from x86_abi_support.asm; it is assumed
    ; to align rsp to 16 and leave the previous rsp on the stack, which is
    ; what the "pop rsp" in the epilog relies on.
    ALIGN_STACK 16, rax

    ; Local frame, byte offsets from rsp. The _lo/_hi pairs are contiguous so
    ; the scalar loop can index all 16 words from the _lo offset.
    %define abs_minus_zbin_lo 0
    %define abs_minus_zbin_hi 16
    %define temp_qcoeff_lo 32
    %define temp_qcoeff_hi 48
    %define save_xmm6 64
    %define save_xmm7 80
    %define eob 96

    %define vp8_regularquantizeb_stack_size eob + 16

    sub         rsp, vp8_regularquantizeb_stack_size

    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
    movdqa      OWORD PTR[rsp + save_xmm7], xmm7

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         eax, arg(8)                 ;zbin_oq_value

    mov         rcx, arg(1)                 ;zbin_ptr
    movd        xmm7, eax

    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr
    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr

    pshuflw     xmm7, xmm7, 0               ;word 0 -> low four words
    psubw       xmm1, xmm0                  ;x = abs(z)

    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)

    mov         rdi, arg(5)                 ;round_ptr
    mov         rsi, arg(6)                 ;quant_ptr

    ; keep x - zbin for the per-coefficient boost test in the scalar loop
    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5

    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back

    movdqa      xmm2, OWORD PTR[rdi]        ;round lo
    movdqa      xmm3, OWORD PTR[rsi]        ;quant lo

    movdqa      xmm6, OWORD PTR[rdi + 16]   ;round hi
    movdqa      xmm7, OWORD PTR[rsi + 16]   ;quant hi

    paddw       xmm1, xmm2                  ;x + round
    paddw       xmm5, xmm6

    pmulhw      xmm1, xmm3                  ;y = ((x + round) * quant) >> 16
    pmulhw      xmm5, xmm7

    mov         rsi, arg(2)                 ;qcoeff_ptr
    pxor        xmm6, xmm6

    pxor        xmm1, xmm0                  ;(y ^ sz) - sz restores the sign
    pxor        xmm5, xmm4

    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5

    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff
    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff

    xor         rax, rax                    ;i = 0
    mov         rcx, -1

    mov         [rsp + eob], rcx            ;eob = -1 (nothing stored yet)
    mov         rsi, arg(9)                 ;zbin_boost_ptr

    mov         rbx, arg(4)                 ;default_zig_zag

    ; Scalar zig-zag pass, unrolled four positions per iteration.
    ; rax = scan index i, rbx = default_zig_zag, rsi = running zbin_boost_ptr.
    ; rdi is shared between the boost value and qcoeff_ptr, so qcoeff_ptr is
    ; reloaded from the argument slot whenever a coefficient is stored.
rq_zigzag_loop:
    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx                    ;quantized to zero: nothing to store
    je          rq_zigzag_1

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr (rewind)
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for position i + 1
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]              ;i++ (lea leaves flags untouched)

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1a

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    je          rq_zigzag_1a

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr (rewind)
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1a:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for position i + 1
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]              ;i++

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1b

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    je          rq_zigzag_1b

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr (rewind)
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1b:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for position i + 1
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]              ;i++

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1c

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    je          rq_zigzag_1c

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr (rewind)
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1c:
    lea         rax, [rax + 1]              ;step to the next group of four

    cmp         rax, 16
    jl          rq_zigzag_loop

    mov         rdi, arg(2)                 ;qcoeff_ptr
    mov         rcx, arg(3)                 ;dequant_ptr
    mov         rsi, arg(7)                 ;dqcoeff_ptr

    movdqa      xmm2, OWORD PTR[rdi]
    movdqa      xmm3, OWORD PTR[rdi + 16]

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]

    pmullw      xmm0, xmm2                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm1, xmm3

    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff
    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff

    mov         rax, [rsp + eob]

    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

    add         rax, 1                      ;return eob + 1

    add         rsp, vp8_regularquantizeb_stack_size
    pop         rsp

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes one block of 16 coefficients without any zero-bin test:
;     x       = abs(coeff)
;     qcoeff  = sign(coeff) applied to (((x + round) * quant) >> 16)
;     dqcoeff = qcoeff * dequant (low 16 bits of each product)
;
; Returns (rax): 0 when every qcoeff is zero; otherwise 1 + the index of the
; highest set bit in the low 16 bits of the sum of scan_mask[i] over all i
; with qcoeff[i] != 0.
; NOTE(review): presumably each scan_mask entry is a single bit marking the
; coefficient's scan position, making the result the end-of-block position;
; confirm against the table supplied by the caller.
;
; Every 16-byte access uses movdqa, so all seven pointers must be 16-byte
; aligned. xmm6 and xmm7 are saved on entry and restored before returning.
;
; The prototype above spells the name "_sse2" while the label historically
; exported here is "_ssse2"; both are exported at the same address so either
; spelling links.
;------------------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_sse2)
global sym(vp8_fast_quantize_b_impl_ssse2)
sym(vp8_fast_quantize_b_impl_sse2):
sym(vp8_fast_quantize_b_impl_ssse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; NOTE(review): ALIGN_STACK comes from x86_abi_support.asm; it is assumed
    ; to align rsp to 16 and leave the previous rsp on the stack, which is
    ; what the "pop rsp" in the epilog relies on.
    ALIGN_STACK 16, rax

    ; Local frame, byte offsets from rsp.
    %define save_xmm6  0
    %define save_xmm7 16

    %define vp8_fastquantizeb_stack_size save_xmm7 + 16

    sub         rsp, vp8_fastquantizeb_stack_size

    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
    movdqa      OWORD PTR[rsp + save_xmm7], xmm7

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rax, arg(3)                 ;scan_mask
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

    movdqa      xmm6, OWORD PTR[rdi]        ;round lo
    movdqa      xmm7, OWORD PTR[rdi + 16]   ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm1, xmm6                  ;x + round
    paddw       xmm5, xmm7

    pmulhw      xmm1, OWORD PTR[rsi]        ;y = ((x + round) * quant) >> 16
    pmulhw      xmm5, OWORD PTR[rsi + 16]

    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm6, OWORD PTR[rcx]        ;dequant lo
    movdqa      xmm7, OWORD PTR[rcx + 16]   ;dequant hi

    pxor        xmm1, xmm0                  ;(y ^ sz) - sz restores the sign
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      OWORD PTR[rdi], xmm1        ;store qcoeff
    movdqa      OWORD PTR[rdi + 16], xmm5   ;store qcoeff

    pmullw      xmm6, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm7, xmm5

    movdqa      xmm2, OWORD PTR[rax]        ;scan_mask lo
    movdqa      xmm3, OWORD PTR[rax + 16]   ;scan_mask hi

    ; Turn each qcoeff word into 1 if non-zero, 0 otherwise.
    pxor        xmm4, xmm4            ;clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4            ;set all bits
    pxor        xmm1, xmm4            ;invert: 0xffff where qcoeff != 0
    pxor        xmm5, xmm4

    psrlw       xmm1, 15
    psrlw       xmm5, 15

    ; Multiply the 0/1 flags by scan_mask; pmaddwd also adds adjacent pairs,
    ; leaving four dword partial sums per register.
    pmaddwd     xmm1, xmm2
    pmaddwd     xmm5, xmm3

    ; Horizontal add of all eight dword partial sums into the low dword.
    movq        xmm2, xmm1
    movq        xmm3, xmm5

    psrldq      xmm1, 8
    psrldq      xmm5, 8

    paddd       xmm1, xmm5
    paddd       xmm2, xmm3

    paddd       xmm1, xmm2
    movq        xmm5, xmm1

    psrldq      xmm1, 4
    paddd       xmm5, xmm1

    movq        rcx,  xmm5
    and         rcx,  0xffff          ;keep the 16-bit mask sum

    ; Branchless "mask ? bsr(mask) + 1 : 0": bsr leaves rax undefined for a
    ; zero source, so the result is ANDed with 0 (mask == 0) or all ones.
    xor         rdx,  rdx
    sub         rdx,  rcx             ;rdx = -mask

    bsr         rax,  rcx
    inc         rax

    sar         rdx,  31              ;all ones if mask != 0, else 0
    and         rax,  rdx

    movdqa      OWORD PTR[rsi], xmm6        ;store dqcoeff
    movdqa      OWORD PTR[rsi + 16], xmm7   ;store dqcoeff

    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

    add         rsp, vp8_fastquantizeb_stack_size
    pop         rsp

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret