Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
BC
public
external
libvpx
Commits
c02fdd02
Commit
c02fdd02
authored
Aug 21, 2017
by
Johann
Browse files
quantize: ignore skip_block in x86
Change-Id: I9a963e99f08761f0c8d6a305619270b2f1c4edf8
parent
b527b473
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
251 additions
and
356 deletions
+251
-356
vpx_dsp/x86/highbd_quantize_intrin_sse2.c
vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+75
-74
vpx_dsp/x86/quantize_avx_x86_64.asm
vpx_dsp/x86/quantize_avx_x86_64.asm
+0
-46
vpx_dsp/x86/quantize_sse2.c
vpx_dsp/x86/quantize_sse2.c
+173
-181
vpx_dsp/x86/quantize_ssse3.c
vpx_dsp/x86/quantize_ssse3.c
+3
-12
vpx_dsp/x86/quantize_ssse3_x86_64.asm
vpx_dsp/x86/quantize_ssse3_x86_64.asm
+0
-43
No files found.
vpx_dsp/x86/highbd_quantize_intrin_sse2.c
View file @
c02fdd02
...
...
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <emmintrin.h>
#include "vpx_dsp/vpx_dsp_common.h"
...
...
@@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
nzbins
[
1
]
=
_mm_sub_epi32
(
nzbins
[
1
],
zbins
[
1
]);
(
void
)
scan
;
(
void
)
skip_block
;
assert
(
!
skip_block
);
memset
(
qcoeff_ptr
,
0
,
count
*
sizeof
(
*
qcoeff_ptr
));
memset
(
dqcoeff_ptr
,
0
,
count
*
sizeof
(
*
dqcoeff_ptr
));
if
(
!
skip_block
)
{
// Pre-scan pass
for
(
i
=
((
int
)
count
/
4
)
-
1
;
i
>=
0
;
i
--
)
{
__m128i
coeffs
,
cmp1
,
cmp2
;
int
test
;
coeffs
=
_mm_load_si128
((
const
__m128i
*
)(
coeff_ptr
+
i
*
4
));
cmp1
=
_mm_cmplt_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
cmp2
=
_mm_cmpgt_epi32
(
coeffs
,
nzbins
[
i
!=
0
]);
cmp1
=
_mm_and_si128
(
cmp1
,
cmp2
);
test
=
_mm_movemask_epi8
(
cmp1
);
if
(
test
==
0xffff
)
non_zero_regs
--
;
else
break
;
}
// Pre-scan pass
for
(
i
=
((
int
)
count
/
4
)
-
1
;
i
>=
0
;
i
--
)
{
__m128i
coeffs
,
cmp1
,
cmp2
;
int
test
;
coeffs
=
_mm_load_si128
((
const
__m128i
*
)(
coeff_ptr
+
i
*
4
));
cmp1
=
_mm_cmplt_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
cmp2
=
_mm_cmpgt_epi32
(
coeffs
,
nzbins
[
i
!=
0
]);
cmp1
=
_mm_and_si128
(
cmp1
,
cmp2
);
test
=
_mm_movemask_epi8
(
cmp1
);
if
(
test
==
0xffff
)
non_zero_regs
--
;
else
break
;
}
// Quantization pass:
for
(
i
=
0
;
i
<
non_zero_regs
;
i
++
)
{
__m128i
coeffs
,
coeffs_sign
,
tmp1
,
tmp2
;
int
test
;
int
abs_coeff
[
4
];
int
coeff_sign
[
4
];
coeffs
=
_mm_load_si128
((
const
__m128i
*
)(
coeff_ptr
+
i
*
4
));
coeffs_sign
=
_mm_srai_epi32
(
coeffs
,
31
);
coeffs
=
_mm_sub_epi32
(
_mm_xor_si128
(
coeffs
,
coeffs_sign
),
coeffs_sign
);
tmp1
=
_mm_cmpgt_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
tmp2
=
_mm_cmpeq_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
tmp1
=
_mm_or_si128
(
tmp1
,
tmp2
);
test
=
_mm_movemask_epi8
(
tmp1
);
_mm_storeu_si128
((
__m128i
*
)
abs_coeff
,
coeffs
);
_mm_storeu_si128
((
__m128i
*
)
coeff_sign
,
coeffs_sign
);
for
(
j
=
0
;
j
<
4
;
j
++
)
{
if
(
test
&
(
1
<<
(
4
*
j
)))
{
int
k
=
4
*
i
+
j
;
const
int64_t
tmp3
=
abs_coeff
[
j
]
+
round_ptr
[
k
!=
0
];
const
int64_t
tmp4
=
((
tmp3
*
quant_ptr
[
k
!=
0
])
>>
16
)
+
tmp3
;
const
uint32_t
abs_qcoeff
=
(
uint32_t
)((
tmp4
*
quant_shift_ptr
[
k
!=
0
])
>>
16
);
qcoeff_ptr
[
k
]
=
(
int
)(
abs_qcoeff
^
coeff_sign
[
j
])
-
coeff_sign
[
j
];
dqcoeff_ptr
[
k
]
=
qcoeff_ptr
[
k
]
*
dequant_ptr
[
k
!=
0
];
if
(
abs_qcoeff
)
eob_i
=
iscan
[
k
]
>
eob_i
?
iscan
[
k
]
:
eob_i
;
}
// Quantization pass:
for
(
i
=
0
;
i
<
non_zero_regs
;
i
++
)
{
__m128i
coeffs
,
coeffs_sign
,
tmp1
,
tmp2
;
int
test
;
int
abs_coeff
[
4
];
int
coeff_sign
[
4
];
coeffs
=
_mm_load_si128
((
const
__m128i
*
)(
coeff_ptr
+
i
*
4
));
coeffs_sign
=
_mm_srai_epi32
(
coeffs
,
31
);
coeffs
=
_mm_sub_epi32
(
_mm_xor_si128
(
coeffs
,
coeffs_sign
),
coeffs_sign
);
tmp1
=
_mm_cmpgt_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
tmp2
=
_mm_cmpeq_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
tmp1
=
_mm_or_si128
(
tmp1
,
tmp2
);
test
=
_mm_movemask_epi8
(
tmp1
);
_mm_storeu_si128
((
__m128i
*
)
abs_coeff
,
coeffs
);
_mm_storeu_si128
((
__m128i
*
)
coeff_sign
,
coeffs_sign
);
for
(
j
=
0
;
j
<
4
;
j
++
)
{
if
(
test
&
(
1
<<
(
4
*
j
)))
{
int
k
=
4
*
i
+
j
;
const
int64_t
tmp3
=
abs_coeff
[
j
]
+
round_ptr
[
k
!=
0
];
const
int64_t
tmp4
=
((
tmp3
*
quant_ptr
[
k
!=
0
])
>>
16
)
+
tmp3
;
const
uint32_t
abs_qcoeff
=
(
uint32_t
)((
tmp4
*
quant_shift_ptr
[
k
!=
0
])
>>
16
);
qcoeff_ptr
[
k
]
=
(
int
)(
abs_qcoeff
^
coeff_sign
[
j
])
-
coeff_sign
[
j
];
dqcoeff_ptr
[
k
]
=
qcoeff_ptr
[
k
]
*
dequant_ptr
[
k
!=
0
];
if
(
abs_qcoeff
)
eob_i
=
iscan
[
k
]
>
eob_i
?
iscan
[
k
]
:
eob_i
;
}
}
}
...
...
@@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2(
const
int
zbin0_tmp
=
ROUND_POWER_OF_TWO
(
zbin_ptr
[
0
],
1
);
const
int
zbin1_tmp
=
ROUND_POWER_OF_TWO
(
zbin_ptr
[
1
],
1
);
(
void
)
scan
;
(
void
)
skip_block
;
assert
(
!
skip_block
);
zbins
[
0
]
=
_mm_set_epi32
(
zbin1_tmp
,
zbin1_tmp
,
zbin1_tmp
,
zbin0_tmp
);
zbins
[
1
]
=
_mm_set1_epi32
(
zbin1_tmp
);
...
...
@@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2(
memset
(
qcoeff_ptr
,
0
,
n_coeffs
*
sizeof
(
*
qcoeff_ptr
));
memset
(
dqcoeff_ptr
,
0
,
n_coeffs
*
sizeof
(
*
dqcoeff_ptr
));
if
(
!
skip_block
)
{
// Pre-scan pass
for
(
i
=
0
;
i
<
n_coeffs
/
4
;
i
++
)
{
__m128i
coeffs
,
cmp1
,
cmp2
;
int
test
;
coeffs
=
_mm_load_si128
((
const
__m128i
*
)(
coeff_ptr
+
i
*
4
));
cmp1
=
_mm_cmplt_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
cmp2
=
_mm_cmpgt_epi32
(
coeffs
,
nzbins
[
i
!=
0
]);
cmp1
=
_mm_and_si128
(
cmp1
,
cmp2
);
test
=
_mm_movemask_epi8
(
cmp1
);
if
(
!
(
test
&
0xf
))
idx_arr
[
idx
++
]
=
i
*
4
;
if
(
!
(
test
&
0xf0
))
idx_arr
[
idx
++
]
=
i
*
4
+
1
;
if
(
!
(
test
&
0xf00
))
idx_arr
[
idx
++
]
=
i
*
4
+
2
;
if
(
!
(
test
&
0xf000
))
idx_arr
[
idx
++
]
=
i
*
4
+
3
;
}
// Pre-scan pass
for
(
i
=
0
;
i
<
n_coeffs
/
4
;
i
++
)
{
__m128i
coeffs
,
cmp1
,
cmp2
;
int
test
;
coeffs
=
_mm_load_si128
((
const
__m128i
*
)(
coeff_ptr
+
i
*
4
));
cmp1
=
_mm_cmplt_epi32
(
coeffs
,
zbins
[
i
!=
0
]);
cmp2
=
_mm_cmpgt_epi32
(
coeffs
,
nzbins
[
i
!=
0
]);
cmp1
=
_mm_and_si128
(
cmp1
,
cmp2
);
test
=
_mm_movemask_epi8
(
cmp1
);
if
(
!
(
test
&
0xf
))
idx_arr
[
idx
++
]
=
i
*
4
;
if
(
!
(
test
&
0xf0
))
idx_arr
[
idx
++
]
=
i
*
4
+
1
;
if
(
!
(
test
&
0xf00
))
idx_arr
[
idx
++
]
=
i
*
4
+
2
;
if
(
!
(
test
&
0xf000
))
idx_arr
[
idx
++
]
=
i
*
4
+
3
;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for
(
i
=
0
;
i
<
idx
;
i
++
)
{
const
int
rc
=
idx_arr
[
i
];
const
int
coeff
=
coeff_ptr
[
rc
];
const
int
coeff_sign
=
(
coeff
>>
31
);
const
int
abs_coeff
=
(
coeff
^
coeff_sign
)
-
coeff_sign
;
const
int64_t
tmp1
=
abs_coeff
+
ROUND_POWER_OF_TWO
(
round_ptr
[
rc
!=
0
],
1
);
const
int64_t
tmp2
=
((
tmp1
*
quant_ptr
[
rc
!=
0
])
>>
16
)
+
tmp1
;
const
uint32_t
abs_qcoeff
=
(
uint32_t
)((
tmp2
*
quant_shift_ptr
[
rc
!=
0
])
>>
15
);
qcoeff_ptr
[
rc
]
=
(
int
)(
abs_qcoeff
^
coeff_sign
)
-
coeff_sign
;
dqcoeff_ptr
[
rc
]
=
qcoeff_ptr
[
rc
]
*
dequant_ptr
[
rc
!=
0
]
/
2
;
if
(
abs_qcoeff
)
eob
=
iscan
[
idx_arr
[
i
]]
>
eob
?
iscan
[
idx_arr
[
i
]]
:
eob
;
}
// Quantization pass: only process the coefficients selected in
// pre-scan pass. Note: idx can be zero.
for
(
i
=
0
;
i
<
idx
;
i
++
)
{
const
int
rc
=
idx_arr
[
i
];
const
int
coeff
=
coeff_ptr
[
rc
];
const
int
coeff_sign
=
(
coeff
>>
31
);
const
int
abs_coeff
=
(
coeff
^
coeff_sign
)
-
coeff_sign
;
const
int64_t
tmp1
=
abs_coeff
+
ROUND_POWER_OF_TWO
(
round_ptr
[
rc
!=
0
],
1
);
const
int64_t
tmp2
=
((
tmp1
*
quant_ptr
[
rc
!=
0
])
>>
16
)
+
tmp1
;
const
uint32_t
abs_qcoeff
=
(
uint32_t
)((
tmp2
*
quant_shift_ptr
[
rc
!=
0
])
>>
15
);
qcoeff_ptr
[
rc
]
=
(
int
)(
abs_qcoeff
^
coeff_sign
)
-
coeff_sign
;
dqcoeff_ptr
[
rc
]
=
qcoeff_ptr
[
rc
]
*
dequant_ptr
[
rc
!=
0
]
/
2
;
if
(
abs_qcoeff
)
eob
=
iscan
[
idx_arr
[
i
]]
>
eob
?
iscan
[
idx_arr
[
i
]]
:
eob
;
}
*
eob_ptr
=
eob
+
1
;
}
...
...
vpx_dsp/x86/quantize_avx_x86_64.asm
View file @
c02fdd02
...
...
@@ -19,10 +19,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
vzeroupper
; If we can skip this block, then just zero the output
cmp
skipmp
,
0
jne
.blank
%ifnidn %1, b_32x32
; Special case for ncoeff == 16, as it is frequent and we can save on
...
...
@@ -493,48 +489,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
mov
[
r2
],
ax
vzeroupper
RET
; Skip-block, i.e. just write all zeroes
.blank:
DEFINE_ARGS
coeff
,
ncoeff
,
skip
,
zbin
,
round
,
quant
,
shift
,
\
qcoeff
,
dqcoeff
,
dequant
,
eob
,
scan
,
iscan
mov
r0
,
dqcoeffmp
movifnidn
ncoeffq
,
ncoeffmp
mov
r2
,
qcoeffmp
mov
r3
,
eobmp
DEFINE_ARGS
dqcoeff
,
ncoeff
,
qcoeff
,
eob
%if CONFIG_VP9_HIGHBITDEPTH
lea
dqcoeffq
,
[
dqcoeffq
+
ncoeffq
*
4
]
lea
qcoeffq
,
[
qcoeffq
+
ncoeffq
*
4
]
%else
lea
dqcoeffq
,
[
dqcoeffq
+
ncoeffq
*
2
]
lea
qcoeffq
,
[
qcoeffq
+
ncoeffq
*
2
]
%endif
neg
ncoeffq
pxor
m7
,
m7
.blank_loop:
%if CONFIG_VP9_HIGHBITDEPTH
mova
[
dqcoeffq
+
ncoeffq
*
4
+
0
],
ymm7
mova
[
dqcoeffq
+
ncoeffq
*
4
+
32
],
ymm7
mova
[
qcoeffq
+
ncoeffq
*
4
+
0
],
ymm7
mova
[
qcoeffq
+
ncoeffq
*
4
+
32
],
ymm7
%else
mova
[
dqcoeffq
+
ncoeffq
*
2
+
0
],
ymm7
mova
[
qcoeffq
+
ncoeffq
*
2
+
0
],
ymm7
%endif
add
ncoeffq
,
mmsize
jl
.blank_loop
mov
[
eobq
],
word
0
vzeroupper
RET
%endmacro
INIT_XMM
avx
...
...
vpx_dsp/x86/quantize_sse2.c
View file @
c02fdd02
...
...
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include <assert.h>
#include <emmintrin.h>
#include <xmmintrin.h>
...
...
@@ -23,7 +24,12 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
uint16_t
*
eob_ptr
,
const
int16_t
*
scan_ptr
,
const
int16_t
*
iscan_ptr
)
{
__m128i
zero
;
__m128i
eob
;
__m128i
zbin
;
__m128i
round
,
quant
,
dequant
,
shift
;
(
void
)
scan_ptr
;
(
void
)
skip_block
;
assert
(
!
skip_block
);
coeff_ptr
+=
n_coeffs
;
iscan_ptr
+=
n_coeffs
;
...
...
@@ -31,193 +37,179 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr
+=
n_coeffs
;
n_coeffs
=
-
n_coeffs
;
zero
=
_mm_setzero_si128
();
if
(
!
skip_block
)
{
__m128i
eob
;
__m128i
zbin
;
__m128i
round
,
quant
,
dequant
,
shift
;
{
__m128i
coeff0
,
coeff1
;
// Setup global values
{
__m128i
pw_1
;
zbin
=
_mm_load_si128
((
const
__m128i
*
)
zbin_ptr
);
round
=
_mm_load_si128
((
const
__m128i
*
)
round_ptr
);
quant
=
_mm_load_si128
((
const
__m128i
*
)
quant_ptr
);
pw_1
=
_mm_set1_epi16
(
1
);
zbin
=
_mm_sub_epi16
(
zbin
,
pw_1
);
dequant
=
_mm_load_si128
((
const
__m128i
*
)
dequant_ptr
);
shift
=
_mm_load_si128
((
const
__m128i
*
)
quant_shift_ptr
);
}
{
__m128i
coeff0
,
coeff1
;
// Setup global values
{
__m128i
pw_1
;
zbin
=
_mm_load_si128
((
const
__m128i
*
)
zbin_ptr
);
round
=
_mm_load_si128
((
const
__m128i
*
)
round_ptr
);
quant
=
_mm_load_si128
((
const
__m128i
*
)
quant_ptr
);
pw_1
=
_mm_set1_epi16
(
1
);
zbin
=
_mm_sub_epi16
(
zbin
,
pw_1
);
dequant
=
_mm_load_si128
((
const
__m128i
*
)
dequant_ptr
);
shift
=
_mm_load_si128
((
const
__m128i
*
)
quant_shift_ptr
);
}
{
__m128i
coeff0_sign
,
coeff1_sign
;
__m128i
qcoeff0
,
qcoeff1
;
__m128i
qtmp0
,
qtmp1
;
__m128i
cmp_mask0
,
cmp_mask1
;
// Do DC and first 15 AC
coeff0
=
load_tran_low
(
coeff_ptr
+
n_coeffs
);
coeff1
=
load_tran_low
(
coeff_ptr
+
n_coeffs
+
8
);
// Poor man's sign extract
coeff0_sign
=
_mm_srai_epi16
(
coeff0
,
15
);
coeff1_sign
=
_mm_srai_epi16
(
coeff1
,
15
);
qcoeff0
=
_mm_xor_si128
(
coeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
coeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
cmp_mask0
=
_mm_cmpgt_epi16
(
qcoeff0
,
zbin
);
zbin
=
_mm_unpackhi_epi64
(
zbin
,
zbin
);
// Switch DC to AC
cmp_mask1
=
_mm_cmpgt_epi16
(
qcoeff1
,
zbin
);
qcoeff0
=
_mm_adds_epi16
(
qcoeff0
,
round
);
round
=
_mm_unpackhi_epi64
(
round
,
round
);
qcoeff1
=
_mm_adds_epi16
(
qcoeff1
,
round
);
qtmp0
=
_mm_mulhi_epi16
(
qcoeff0
,
quant
);
quant
=
_mm_unpackhi_epi64
(
quant
,
quant
);
qtmp1
=
_mm_mulhi_epi16
(
qcoeff1
,
quant
);
qtmp0
=
_mm_add_epi16
(
qtmp0
,
qcoeff0
);
qtmp1
=
_mm_add_epi16
(
qtmp1
,
qcoeff1
);
qcoeff0
=
_mm_mulhi_epi16
(
qtmp0
,
shift
);
shift
=
_mm_unpackhi_epi64
(
shift
,
shift
);
qcoeff1
=
_mm_mulhi_epi16
(
qtmp1
,
shift
);
// Reinsert signs
qcoeff0
=
_mm_xor_si128
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
qcoeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
// Mask out zbin threshold coeffs
qcoeff0
=
_mm_and_si128
(
qcoeff0
,
cmp_mask0
);
qcoeff1
=
_mm_and_si128
(
qcoeff1
,
cmp_mask1
);
store_tran_low
(
qcoeff0
,
qcoeff_ptr
+
n_coeffs
);
store_tran_low
(
qcoeff1
,
qcoeff_ptr
+
n_coeffs
+
8
);
coeff0
=
_mm_mullo_epi16
(
qcoeff0
,
dequant
);
dequant
=
_mm_unpackhi_epi64
(
dequant
,
dequant
);
coeff1
=
_mm_mullo_epi16
(
qcoeff1
,
dequant
);
store_tran_low
(
coeff0
,
dqcoeff_ptr
+
n_coeffs
);
store_tran_low
(
coeff1
,
dqcoeff_ptr
+
n_coeffs
+
8
);
}
{
// Scan for eob
__m128i
zero_coeff0
,
zero_coeff1
;
__m128i
nzero_coeff0
,
nzero_coeff1
;
__m128i
iscan0
,
iscan1
;
__m128i
eob1
;
zero_coeff0
=
_mm_cmpeq_epi16
(
coeff0
,
zero
);
zero_coeff1
=
_mm_cmpeq_epi16
(
coeff1
,
zero
);
nzero_coeff0
=
_mm_cmpeq_epi16
(
zero_coeff0
,
zero
);
nzero_coeff1
=
_mm_cmpeq_epi16
(
zero_coeff1
,
zero
);
iscan0
=
_mm_load_si128
((
const
__m128i
*
)(
iscan_ptr
+
n_coeffs
));
iscan1
=
_mm_load_si128
((
const
__m128i
*
)(
iscan_ptr
+
n_coeffs
)
+
1
);
// Add one to convert from indices to counts
iscan0
=
_mm_sub_epi16
(
iscan0
,
nzero_coeff0
);
iscan1
=
_mm_sub_epi16
(
iscan1
,
nzero_coeff1
);
eob
=
_mm_and_si128
(
iscan0
,
nzero_coeff0
);
eob1
=
_mm_and_si128
(
iscan1
,
nzero_coeff1
);
eob
=
_mm_max_epi16
(
eob
,
eob1
);
}
n_coeffs
+=
8
*
2
;
__m128i
coeff0_sign
,
coeff1_sign
;
__m128i
qcoeff0
,
qcoeff1
;
__m128i
qtmp0
,
qtmp1
;
__m128i
cmp_mask0
,
cmp_mask1
;
// Do DC and first 15 AC
coeff0
=
load_tran_low
(
coeff_ptr
+
n_coeffs
);
coeff1
=
load_tran_low
(
coeff_ptr
+
n_coeffs
+
8
);
// Poor man's sign extract
coeff0_sign
=
_mm_srai_epi16
(
coeff0
,
15
);
coeff1_sign
=
_mm_srai_epi16
(
coeff1
,
15
);
qcoeff0
=
_mm_xor_si128
(
coeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
coeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
cmp_mask0
=
_mm_cmpgt_epi16
(
qcoeff0
,
zbin
);
zbin
=
_mm_unpackhi_epi64
(
zbin
,
zbin
);
// Switch DC to AC
cmp_mask1
=
_mm_cmpgt_epi16
(
qcoeff1
,
zbin
);
qcoeff0
=
_mm_adds_epi16
(
qcoeff0
,
round
);
round
=
_mm_unpackhi_epi64
(
round
,
round
);
qcoeff1
=
_mm_adds_epi16
(
qcoeff1
,
round
);
qtmp0
=
_mm_mulhi_epi16
(
qcoeff0
,
quant
);
quant
=
_mm_unpackhi_epi64
(
quant
,
quant
);
qtmp1
=
_mm_mulhi_epi16
(
qcoeff1
,
quant
);
qtmp0
=
_mm_add_epi16
(
qtmp0
,
qcoeff0
);
qtmp1
=
_mm_add_epi16
(
qtmp1
,
qcoeff1
);
qcoeff0
=
_mm_mulhi_epi16
(
qtmp0
,
shift
);
shift
=
_mm_unpackhi_epi64
(
shift
,
shift
);
qcoeff1
=
_mm_mulhi_epi16
(
qtmp1
,
shift
);
// Reinsert signs
qcoeff0
=
_mm_xor_si128
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
qcoeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
// Mask out zbin threshold coeffs
qcoeff0
=
_mm_and_si128
(
qcoeff0
,
cmp_mask0
);
qcoeff1
=
_mm_and_si128
(
qcoeff1
,
cmp_mask1
);
store_tran_low
(
qcoeff0
,
qcoeff_ptr
+
n_coeffs
);
store_tran_low
(
qcoeff1
,
qcoeff_ptr
+
n_coeffs
+
8
);
coeff0
=
_mm_mullo_epi16
(
qcoeff0
,
dequant
);
dequant
=
_mm_unpackhi_epi64
(
dequant
,
dequant
);
coeff1
=
_mm_mullo_epi16
(
qcoeff1
,
dequant
);
store_tran_low
(
coeff0
,
dqcoeff_ptr
+
n_coeffs
);
store_tran_low
(
coeff1
,
dqcoeff_ptr
+
n_coeffs
+
8
);
}
// AC only loop
while
(
n_coeffs
<
0
)
{
__m128i
coeff0
,
coeff1
;
{
__m128i
coeff0_sign
,
coeff1_sign
;
__m128i
qcoeff0
,
qcoeff1
;
__m128i
qtmp0
,
qtmp1
;
__m128i
cmp_mask0
,
cmp_mask1
;
coeff0
=
load_tran_low
(
coeff_ptr
+
n_coeffs
);
coeff1
=
load_tran_low
(
coeff_ptr
+
n_coeffs
+
8
);
// Poor man's sign extract
coeff0_sign
=
_mm_srai_epi16
(
coeff0
,
15
);
coeff1_sign
=
_mm_srai_epi16
(
coeff1
,
15
);
qcoeff0
=
_mm_xor_si128
(
coeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
coeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
cmp_mask0
=
_mm_cmpgt_epi16
(
qcoeff0
,
zbin
);
cmp_mask1
=
_mm_cmpgt_epi16
(
qcoeff1
,
zbin
);
qcoeff0
=
_mm_adds_epi16
(
qcoeff0
,
round
);
qcoeff1
=
_mm_adds_epi16
(
qcoeff1
,
round
);
qtmp0
=
_mm_mulhi_epi16
(
qcoeff0
,
quant
);
qtmp1
=
_mm_mulhi_epi16
(
qcoeff1
,
quant
);
qtmp0
=
_mm_add_epi16
(
qtmp0
,
qcoeff0
);
qtmp1
=
_mm_add_epi16
(
qtmp1
,
qcoeff1
);
qcoeff0
=
_mm_mulhi_epi16
(
qtmp0
,
shift
);
qcoeff1
=
_mm_mulhi_epi16
(
qtmp1
,
shift
);
// Reinsert signs
qcoeff0
=
_mm_xor_si128
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
qcoeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
// Mask out zbin threshold coeffs
qcoeff0
=
_mm_and_si128
(
qcoeff0
,
cmp_mask0
);
qcoeff1
=
_mm_and_si128
(
qcoeff1
,
cmp_mask1
);
store_tran_low
(
qcoeff0
,
qcoeff_ptr
+
n_coeffs
);
store_tran_low
(
qcoeff1
,
qcoeff_ptr
+
n_coeffs
+
8
);
coeff0
=
_mm_mullo_epi16
(
qcoeff0
,
dequant
);
coeff1
=
_mm_mullo_epi16
(
qcoeff1
,
dequant
);
store_tran_low
(
coeff0
,
dqcoeff_ptr
+
n_coeffs
);
store_tran_low
(
coeff1
,
dqcoeff_ptr
+
n_coeffs
+
8
);
}
{
// Scan for eob
__m128i
zero_coeff0
,
zero_coeff1
;
__m128i
nzero_coeff0
,
nzero_coeff1
;
__m128i
iscan0
,
iscan1
;
__m128i
eob0
,
eob1
;
zero_coeff0
=
_mm_cmpeq_epi16
(
coeff0
,
zero
);
zero_coeff1
=
_mm_cmpeq_epi16
(
coeff1
,
zero
);
nzero_coeff0
=
_mm_cmpeq_epi16
(
zero_coeff0
,
zero
);
nzero_coeff1
=
_mm_cmpeq_epi16
(
zero_coeff1
,
zero
);
iscan0
=
_mm_load_si128
((
const
__m128i
*
)(
iscan_ptr
+
n_coeffs
));
iscan1
=
_mm_load_si128
((
const
__m128i
*
)(
iscan_ptr
+
n_coeffs
)
+
1
);
// Add one to convert from indices to counts
iscan0
=
_mm_sub_epi16
(
iscan0
,
nzero_coeff0
);
iscan1
=
_mm_sub_epi16
(
iscan1
,
nzero_coeff1
);
eob0
=
_mm_and_si128
(
iscan0
,
nzero_coeff0
);
eob1
=
_mm_and_si128
(
iscan1
,
nzero_coeff1
);
eob0
=
_mm_max_epi16
(
eob0
,
eob1
);
eob
=
_mm_max_epi16
(
eob
,
eob0
);
}
n_coeffs
+=
8
*
2
;
{
// Scan for eob
__m128i
zero_coeff0
,
zero_coeff1
;
__m128i
nzero_coeff0
,
nzero_coeff1
;
__m128i
iscan0
,
iscan1
;
__m128i
eob1
;
zero_coeff0
=
_mm_cmpeq_epi16
(
coeff0
,
zero
);
zero_coeff1
=
_mm_cmpeq_epi16
(
coeff1
,
zero
);
nzero_coeff0
=
_mm_cmpeq_epi16
(
zero_coeff0
,
zero
);
nzero_coeff1
=
_mm_cmpeq_epi16
(
zero_coeff1
,
zero
);
iscan0
=
_mm_load_si128
((
const
__m128i
*
)(
iscan_ptr
+
n_coeffs
));
iscan1
=
_mm_load_si128
((
const
__m128i
*
)(
iscan_ptr
+
n_coeffs
)
+
1
);
// Add one to convert from indices to counts
iscan0
=
_mm_sub_epi16
(
iscan0
,
nzero_coeff0
);
iscan1
=
_mm_sub_epi16
(
iscan1
,
nzero_coeff1
);
eob
=
_mm_and_si128
(
iscan0
,
nzero_coeff0
);
eob1
=
_mm_and_si128
(
iscan1
,
nzero_coeff1
);
eob
=
_mm_max_epi16
(
eob
,
eob1
);
}
n_coeffs
+=
8
*
2
;
}
// Accumulate EOB
// AC only loop
while
(
n_coeffs
<
0
)
{
__m128i
coeff0
,
coeff1
;
{
__m128i
eob_shuffled
;
eob_shuffled
=
_mm_shuffle_epi32
(
eob
,
0xe
);
eob
=
_mm_max_epi16
(
eob
,
eob_shuffled
);
eob_shuffled
=
_mm_shufflelo_epi16
(
eob
,
0xe
);
eob
=
_mm_max_epi16
(
eob
,
eob_shuffled
);
eob_shuffled
=
_mm_shufflelo_epi16
(
eob
,
0x1
);
eob
=
_mm_max_epi16
(
eob
,
eob_shuffled
);
*
eob_ptr
=
_mm_extract_epi16
(
eob
,
1
);
__m128i
coeff0_sign
,
coeff1_sign
;
__m128i
qcoeff0
,
qcoeff1
;
__m128i
qtmp0
,
qtmp1
;
__m128i
cmp_mask0
,
cmp_mask1
;
coeff0
=
load_tran_low
(
coeff_ptr
+
n_coeffs
);
coeff1
=
load_tran_low
(
coeff_ptr
+
n_coeffs
+
8
);
// Poor man's sign extract
coeff0_sign
=
_mm_srai_epi16
(
coeff0
,
15
);
coeff1_sign
=
_mm_srai_epi16
(
coeff1
,
15
);
qcoeff0
=
_mm_xor_si128
(
coeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
coeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
cmp_mask0
=
_mm_cmpgt_epi16
(
qcoeff0
,
zbin
);
cmp_mask1
=
_mm_cmpgt_epi16
(
qcoeff1
,
zbin
);
qcoeff0
=
_mm_adds_epi16
(
qcoeff0
,
round
);
qcoeff1
=
_mm_adds_epi16
(
qcoeff1
,
round
);
qtmp0
=
_mm_mulhi_epi16
(
qcoeff0
,
quant
);
qtmp1
=
_mm_mulhi_epi16
(
qcoeff1
,
quant
);
qtmp0
=
_mm_add_epi16
(
qtmp0
,
qcoeff0
);
qtmp1
=
_mm_add_epi16
(
qtmp1
,
qcoeff1
);
qcoeff0
=
_mm_mulhi_epi16
(
qtmp0
,
shift
);
qcoeff1
=
_mm_mulhi_epi16
(
qtmp1
,
shift
);
// Reinsert signs
qcoeff0
=
_mm_xor_si128
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_xor_si128
(
qcoeff1
,
coeff1_sign
);
qcoeff0
=
_mm_sub_epi16
(
qcoeff0
,
coeff0_sign
);
qcoeff1
=
_mm_sub_epi16
(
qcoeff1
,
coeff1_sign
);
// Mask out zbin threshold coeffs
qcoeff0
=
_mm_and_si128
(
qcoeff0
,
cmp_mask0
);
qcoeff1
=
_mm_and_si128
(
qcoeff1
,
cmp_mask1
);
store_tran_low
(
qcoeff0
,
qcoeff_ptr
+
n_coeffs
);
store_tran_low
(
qcoeff1
,
qcoeff_ptr
+
n_coeffs
+
8
);
coeff0
=
_mm_mullo_epi16
(
qcoeff0
,
dequant
);
coeff1
=
_mm_mullo_epi16
(
qcoeff1
,
dequant
);
store_tran_low
(
coeff0
,
dqcoeff_ptr
+
n_coeffs
);
store_tran_low
(
coeff1
,
dqcoeff_ptr
+
n_coeffs
+
8
);
}
}
else
{
do
{
store_tran_low
(
zero
,
dqcoeff_ptr
+
n_coeffs
);
store_tran_low
(
zero
,
dqcoeff_ptr
+
n_coeffs
+
8
);
store_tran_low
(
zero
,
qcoeff_ptr
+
n_coeffs
);