Commit 996697e2 authored by Christophe Gisquet, committed by Janne Grunau
Browse files

x86: float dsp: unroll SSE versions



vector_fmul and vector_fmac_scalar are guaranteed to be called with lengths
that are multiples of 16 elements, but their SSE versions only process 8 at a time.

Therefore, unroll them a bit.
This goes from 299 to 261 cycles for 256 elements in vector_fmac_scalar on Arrandale/Win64.
Signed-off-by: Janne Grunau <janne-libav@jannau.net>
parent ef010f08
...@@ -27,17 +27,21 @@ SECTION .text ...@@ -27,17 +27,21 @@ SECTION .text
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro VECTOR_FMUL 0 %macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len cglobal vector_fmul, 4,4,2, dst, src0, src1, len
lea lenq, [lend*4 - 2*mmsize] lea lenq, [lend*4 - 64]
ALIGN 16 ALIGN 16
.loop: .loop:
mova m0, [src0q + lenq] %assign a 0
mova m1, [src0q + lenq + mmsize] %rep 32/mmsize
mulps m0, m0, [src1q + lenq] mova m0, [src0q + lenq + (a+0)*mmsize]
mulps m1, m1, [src1q + lenq + mmsize] mova m1, [src0q + lenq + (a+1)*mmsize]
mova [dstq + lenq], m0 mulps m0, m0, [src1q + lenq + (a+0)*mmsize]
mova [dstq + lenq + mmsize], m1 mulps m1, m1, [src1q + lenq + (a+1)*mmsize]
mova [dstq + lenq + (a+0)*mmsize], m0
mova [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep
sub lenq, 2*mmsize sub lenq, 64
jge .loop jge .loop
REP_RET REP_RET
%endmacro %endmacro
...@@ -68,15 +72,19 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len ...@@ -68,15 +72,19 @@ cglobal vector_fmac_scalar, 4,4,3, dst, src, mul, len
vinsertf128 m0, m0, xmm0, 1 vinsertf128 m0, m0, xmm0, 1
%endif %endif
%endif %endif
lea lenq, [lend*4-2*mmsize] lea lenq, [lend*4-64]
.loop: .loop:
mulps m1, m0, [srcq+lenq ] %assign a 0
mulps m2, m0, [srcq+lenq+mmsize] %rep 32/mmsize
addps m1, m1, [dstq+lenq ] mulps m1, m0, [srcq+lenq+(a+0)*mmsize]
addps m2, m2, [dstq+lenq+mmsize] mulps m2, m0, [srcq+lenq+(a+1)*mmsize]
mova [dstq+lenq ], m1 addps m1, m1, [dstq+lenq+(a+0)*mmsize]
mova [dstq+lenq+mmsize], m2 addps m2, m2, [dstq+lenq+(a+1)*mmsize]
sub lenq, 2*mmsize mova [dstq+lenq+(a+0)*mmsize], m1
mova [dstq+lenq+(a+1)*mmsize], m2
%assign a a+2
%endrep
sub lenq, 64
jge .loop jge .loop
REP_RET REP_RET
%endmacro %endmacro
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment