;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;-----------------------------------------------------------------------------
;int vp8_regular_quantize_b_impl_sse2(
;               short *coeff_ptr,               arg(0)
;               short *zbin_ptr,                arg(1)
;               short *qcoeff_ptr,              arg(2)
;               short *dequant_ptr,             arg(3)
;               const int *default_zig_zag,     arg(4)
;               short *round_ptr,               arg(5)
;               short *quant_ptr,               arg(6)
;               short *dqcoeff_ptr,             arg(7)
;               unsigned short zbin_oq_value,   arg(8)
;               short *zbin_boost_ptr,          arg(9)
;               short *quant_shift);            arg(10)
;
; Quantizes one block of 16 coefficients.
;
; Writes:   16 words to qcoeff_ptr and 16 words to dqcoeff_ptr.
; Returns:  eob + 1, where eob is the last zig-zag index whose quantized
;           value was non-zero (so 0 when every coefficient quantized to 0).
; Requires: coeff_ptr, zbin_ptr, round_ptr, quant_ptr, dequant_ptr,
;           qcoeff_ptr and dqcoeff_ptr are accessed with movdqa and must
;           therefore be 16-byte aligned.
;
; Arguments are read through the arg(n) macro after SHADOW_ARGS_TO_STACK;
; xmm6/xmm7 are used, hence SAVE_XMM / RESTORE_XMM around the body.
;-----------------------------------------------------------------------------
global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 11
    SAVE_XMM
    push        rsi
    push        rdi
    push        rbx
    ALIGN_STACK 16, rax

    ; Local scratch area, addressed relative to the aligned rsp.
    %define abs_minus_zbin    0             ; 16 words: abs(z) - (zbin + zbin_oq)
    %define temp_qcoeff       32            ; 16 words: candidate y before the shift
    %define qcoeff            64            ; 16 words: unsigned quantized values
    %define eob_tmp           96            ; dword:    last non-zero zig-zag index
    %define stack_size        112
    sub         rsp, stack_size
    ; end prolog

    mov         rdx, arg(0)                 ; coeff_ptr
    mov         rcx, arg(1)                 ; zbin_ptr
    movd        xmm7, arg(8)                ; zbin_oq_value
    mov         rdi, arg(5)                 ; round_ptr
    mov         rsi, arg(6)                 ; quant_ptr

    ; z
    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

    ; broadcast zbin_oq_value into all 8 words of xmm7
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz (kept in xmm0/xmm4 until the sign is re-applied after the scan)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, OWORD PTR[rcx]
    movdqa      xmm3, OWORD PTR[rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      OWORD PTR[rsp + abs_minus_zbin], xmm1
    movdqa      OWORD PTR[rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back, so xmm1/xmm5 hold abs(z) again
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, OWORD PTR[rdi]
    movdqa      xmm6, OWORD PTR[rdi + 16]

    movdqa      xmm3, OWORD PTR[rsi]
    movdqa      xmm7, OWORD PTR[rsi + 16]

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      OWORD PTR[rsp + temp_qcoeff], xmm1
    movdqa      OWORD PTR[rsp + temp_qcoeff + 16], xmm5

    pxor        xmm6, xmm6
    ; zero qcoeff
    movdqa      OWORD PTR[rsp + qcoeff], xmm6
    movdqa      OWORD PTR[rsp + qcoeff + 16], xmm6

    mov         [rsp + eob_tmp], DWORD -1   ; eob
    mov         rsi, arg(9)                 ; zbin_boost_ptr
    mov         rdi, arg(4)                 ; default_zig_zag
    mov         rax, arg(10)                ; quant_shift_ptr

; ZIGZAG_LOOP i, next
;   One unrolled step of the zig-zag scan for scan position i; "next" is the
;   label suffix of the following step (or "end").
;   In:    rdi = default_zig_zag, rsi = current zbin_boost_ptr,
;          rax = quant_shift_ptr, scratch area at rsp.
;   Clobb: rbx, rcx, rdx, flags; rsi is advanced, or reset to arg(9) when a
;          non-zero coefficient is emitted.
%macro ZIGZAG_LOOP 2
rq_zigzag_loop_%1:
    movsxd      rdx, DWORD PTR[rdi + (%1 * 4)] ; rc
    movsx       ebx, WORD PTR [rsi]         ; *zbin_boost_ptr
    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++

    ; x
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

    ; if (x >= zbin)
    sub         ecx, ebx                    ; x - zbin
    jl          rq_zigzag_loop_%2           ; x < zbin

    movsx       ebx, WORD PTR[rsp + temp_qcoeff + rdx *2]

    ; downshift by quant_shift[rdx]
    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
    sar         ebx, cl
    ; A shift by a zero count leaves the flags untouched, in which case ZF
    ; would still reflect the "sub ecx, ebx" above. Test y explicitly.
    test        ebx, ebx
    je          rq_zigzag_loop_%2           ; !y
    mov         WORD PTR[rsp + qcoeff + rdx * 2], bx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ; reset to b->zrun_zbin_boost
    mov         [rsp + eob_tmp], DWORD %1   ; eob = i
%endmacro
ZIGZAG_LOOP 0, 1
ZIGZAG_LOOP 1, 2
ZIGZAG_LOOP 2, 3
ZIGZAG_LOOP 3, 4
ZIGZAG_LOOP 4, 5
ZIGZAG_LOOP 5, 6
ZIGZAG_LOOP 6, 7
ZIGZAG_LOOP 7, 8
ZIGZAG_LOOP 8, 9
ZIGZAG_LOOP 9, 10
ZIGZAG_LOOP 10, 11
ZIGZAG_LOOP 11, 12
ZIGZAG_LOOP 12, 13
ZIGZAG_LOOP 13, 14
ZIGZAG_LOOP 14, 15
ZIGZAG_LOOP 15, end
rq_zigzag_loop_end:

    mov         rbx, arg(2)                 ; qcoeff_ptr
    mov         rcx, arg(3)                 ; dequant_ptr
    mov         rsi, arg(7)                 ; dqcoeff_ptr
    ; eob_tmp is only ever written as a DWORD, so load exactly that width.
    movsxd      rax, DWORD PTR[rsp + eob_tmp] ; eob

    movdqa      xmm2, OWORD PTR[rsp + qcoeff]
    movdqa      xmm3, OWORD PTR[rsp + qcoeff + 16]

    ; y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]

    ; dqcoeff = qcoeff * dequant (low 16 bits of each product)
    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      OWORD PTR[rbx], xmm2        ; store qcoeff
    movdqa      OWORD PTR[rbx + 16], xmm3   ; store qcoeff
    movdqa      OWORD PTR[rsi], xmm0        ; store dqcoeff
    movdqa      OWORD PTR[rsi + 16], xmm1   ; store dqcoeff

    add         rax, 1                      ; return eob + 1

    ; begin epilog
    add         rsp, stack_size
    pop         rsp                         ; presumably undoes ALIGN_STACK (macro defined in the include)
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,        arg(0)
;                                  short *qcoeff_ptr,       arg(1)
;                                  short *dequant_ptr,      arg(2)
;                                  short *inv_scan_order,   arg(3)
;                                  short *round_ptr,        arg(4)
;                                  short *quant_ptr,        arg(5)
;                                  short *dqcoeff_ptr);     arg(6)
;
; Quantizes 16 coefficients without a zero-bin test:
;     x       = abs(z) + round
;     y       = (x * quant) >> 16
;     qcoeff  = y with the sign of z re-applied
;     dqcoeff = qcoeff * dequant          (low 16 bits)
;
; Returns (in rax) the largest inv_scan_order[] entry, masked to 8 bits,
; among positions whose qcoeff is non-zero; 0 if every qcoeff is zero.
; Every vector load/store is an aligned access, so all seven pointers must
; be 16-byte aligned.
;-----------------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                 ; coeff_ptr
    mov         rdi, arg(4)                 ; round_ptr
    mov         rsi, arg(5)                 ; quant_ptr
    mov         rcx, arg(2)                 ; dequant_ptr

    ; ---- coefficients 0..7: xmm0 = sign mask, xmm1 = working value ----
    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm1, xmm0
    psraw       xmm0, 15                    ; sz = z >> 15 (all ones if z < 0)
    pxor        xmm1, xmm0
    psubw       xmm1, xmm0                  ; x = (z ^ sz) - sz = abs(z)
    paddw       xmm1, XMMWORD PTR[rdi]      ; x += round (lo)
    pmulhw      xmm1, XMMWORD PTR[rsi]      ; y = (x * quant) >> 16
    pxor        xmm1, xmm0
    psubw       xmm1, xmm0                  ; qcoeff lo = (y ^ sz) - sz

    ; ---- coefficients 8..15: xmm4 = sign mask, xmm5 = working value ----
    movdqa      xmm4, XMMWORD PTR[rdx + 16]
    movdqa      xmm5, xmm4
    psraw       xmm4, 15                    ; sz
    pxor        xmm5, xmm4
    psubw       xmm5, xmm4                  ; x = abs(z)
    paddw       xmm5, XMMWORD PTR[rdi + 16] ; x += round (hi)
    pmulhw      xmm5, XMMWORD PTR[rsi + 16] ; y = (x * quant) >> 16
    pxor        xmm5, xmm4
    psubw       xmm5, xmm4                  ; qcoeff hi = (y ^ sz) - sz

    ; dequant factors are fetched before anything is written back
    movdqa      xmm2, XMMWORD PTR[rcx]
    movdqa      xmm3, XMMWORD PTR[rcx + 16]

    mov         rdi, arg(1)                 ; qcoeff_ptr
    mov         rsi, arg(6)                 ; dqcoeff_ptr

    movdqa      XMMWORD PTR[rdi], xmm1      ; store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5 ; store qcoeff

    pmullw      xmm2, xmm1                  ; dqcoeff lo = qcoeff * dequant
    pmullw      xmm3, xmm5                  ; dqcoeff hi = qcoeff * dequant

    mov         rdi, arg(3)                 ; inv_scan_order

    ; ---- eob: keep inv_scan_order[i] only where qcoeff[i] != 0 ----
    pxor        xmm4, xmm4                  ; zero for the comparison
    pcmpeqw     xmm1, xmm4                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4
    pandn       xmm1, XMMWORD PTR[rdi]      ; (~is_zero) & inv_scan_order lo
    pandn       xmm5, XMMWORD PTR[rdi+16]   ; (~is_zero) & inv_scan_order hi

    ; horizontal signed max: 16 -> 8 -> 4 -> 2 -> 1 candidates
    pmaxsw      xmm1, xmm5                  ; 8 candidates left
    pshufd      xmm5, xmm1, 00001110b       ; bring the upper 64 bits down
    pmaxsw      xmm1, xmm5                  ; 4 left (words 0..3)
    pshuflw     xmm5, xmm1, 00001110b       ; words 2,3 -> words 0,1
    pmaxsw      xmm1, xmm5                  ; 2 left (words 0..1)
    pshuflw     xmm5, xmm1, 00000001b       ; word 1 -> word 0
    pmaxsw      xmm1, xmm5                  ; maximum now in word 0

    movd        rax, xmm1
    and         rax, 0xff                   ; return value = low byte of word 0

    movdqa      XMMWORD PTR[rsi], xmm2      ; store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm3 ; store dqcoeff

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret