Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
BC
public
external
libvpx
Commits
037e596f
Commit
037e596f
authored
Oct 24, 2017
by
Kyle Siefring
Committed by
Gerrit Code Review
Oct 24, 2017
Browse files
Merge "Optimize convolve8 SSSE3 and AVX2 intrinsics"
parents
e0aa6b24
ae35425a
Changes
4
Hide whitespace changes
Inline
Side-by-side
test/convolve_test.cc
View file @
037e596f
...
...
@@ -603,6 +603,75 @@ TEST_P(ConvolveTest, DISABLED_Scale_Speed) {
UUT_
->
use_highbd_
?
UUT_
->
use_highbd_
:
8
,
elapsed_time
);
}
TEST_P
(
ConvolveTest
,
DISABLED_8Tap_Speed
)
{
const
uint8_t
*
const
in
=
input
();
uint8_t
*
const
out
=
output
();
const
InterpKernel
*
const
eighttap
=
vp9_filter_kernels
[
EIGHTTAP_SHARP
];
const
int
kNumTests
=
5000000
;
const
int
width
=
Width
();
const
int
height
=
Height
();
vpx_usec_timer
timer
;
SetConstantInput
(
127
);
vpx_usec_timer_start
(
&
timer
);
for
(
int
n
=
0
;
n
<
kNumTests
;
++
n
)
{
UUT_
->
hv8_
[
0
](
in
,
kInputStride
,
out
,
kOutputStride
,
eighttap
,
8
,
16
,
8
,
16
,
width
,
height
);
}
vpx_usec_timer_mark
(
&
timer
);
const
int
elapsed_time
=
static_cast
<
int
>
(
vpx_usec_timer_elapsed
(
&
timer
));
printf
(
"convolve8_%dx%d_%d: %d us
\n
"
,
width
,
height
,
UUT_
->
use_highbd_
?
UUT_
->
use_highbd_
:
8
,
elapsed_time
);
}
TEST_P
(
ConvolveTest
,
DISABLED_8Tap_Horiz_Speed
)
{
const
uint8_t
*
const
in
=
input
();
uint8_t
*
const
out
=
output
();
const
InterpKernel
*
const
eighttap
=
vp9_filter_kernels
[
EIGHTTAP_SHARP
];
const
int
kNumTests
=
5000000
;
const
int
width
=
Width
();
const
int
height
=
Height
();
vpx_usec_timer
timer
;
SetConstantInput
(
127
);
vpx_usec_timer_start
(
&
timer
);
for
(
int
n
=
0
;
n
<
kNumTests
;
++
n
)
{
UUT_
->
h8_
[
0
](
in
,
kInputStride
,
out
,
kOutputStride
,
eighttap
,
8
,
16
,
8
,
16
,
width
,
height
);
}
vpx_usec_timer_mark
(
&
timer
);
const
int
elapsed_time
=
static_cast
<
int
>
(
vpx_usec_timer_elapsed
(
&
timer
));
printf
(
"convolve8_horiz_%dx%d_%d: %d us
\n
"
,
width
,
height
,
UUT_
->
use_highbd_
?
UUT_
->
use_highbd_
:
8
,
elapsed_time
);
}
TEST_P
(
ConvolveTest
,
DISABLED_8Tap_Vert_Speed
)
{
const
uint8_t
*
const
in
=
input
();
uint8_t
*
const
out
=
output
();
const
InterpKernel
*
const
eighttap
=
vp9_filter_kernels
[
EIGHTTAP_SHARP
];
const
int
kNumTests
=
5000000
;
const
int
width
=
Width
();
const
int
height
=
Height
();
vpx_usec_timer
timer
;
SetConstantInput
(
127
);
vpx_usec_timer_start
(
&
timer
);
for
(
int
n
=
0
;
n
<
kNumTests
;
++
n
)
{
UUT_
->
v8_
[
0
](
in
,
kInputStride
,
out
,
kOutputStride
,
eighttap
,
8
,
16
,
8
,
16
,
width
,
height
);
}
vpx_usec_timer_mark
(
&
timer
);
const
int
elapsed_time
=
static_cast
<
int
>
(
vpx_usec_timer_elapsed
(
&
timer
));
printf
(
"convolve8_vert_%dx%d_%d: %d us
\n
"
,
width
,
height
,
UUT_
->
use_highbd_
?
UUT_
->
use_highbd_
:
8
,
elapsed_time
);
}
TEST_P
(
ConvolveTest
,
DISABLED_8Tap_Avg_Speed
)
{
const
uint8_t
*
const
in
=
input
();
uint8_t
*
const
out
=
output
();
...
...
vpx_dsp/x86/convolve_avx2.h
View file @
037e596f
...
...
@@ -58,16 +58,19 @@ static INLINE __m256i convolve8_16_avx2(const __m256i *const s,
const
__m256i
x1
=
_mm256_maddubs_epi16
(
s
[
1
],
f
[
1
]);
const
__m256i
x2
=
_mm256_maddubs_epi16
(
s
[
2
],
f
[
2
]);
const
__m256i
x3
=
_mm256_maddubs_epi16
(
s
[
3
],
f
[
3
]);
// add and saturate the results together
const
__m256i
min_x2x1
=
_mm256_min_epi16
(
x2
,
x1
);
const
__m256i
max_x2x1
=
_mm256_max_epi16
(
x2
,
x1
);
__m256i
temp
=
_mm256_adds_epi16
(
x0
,
x3
);
temp
=
_mm256_adds_epi16
(
temp
,
min_x2x1
);
temp
=
_mm256_adds_epi16
(
temp
,
max_x2x1
);
__m256i
sum1
,
sum2
;
// sum the results together, saturating only on the final step
// adding x0 with x2 and x1 with x3 is the only order that prevents
// outranges for all filters
sum1
=
_mm256_add_epi16
(
x0
,
x2
);
sum2
=
_mm256_add_epi16
(
x1
,
x3
);
// add the rounding offset early to avoid another saturated add
sum1
=
_mm256_add_epi16
(
sum1
,
k_64
);
sum1
=
_mm256_adds_epi16
(
sum1
,
sum2
);
// round and shift by 7 bit each 16 bit
temp
=
_mm256_adds_epi16
(
temp
,
k_64
);
temp
=
_mm256_srai_epi16
(
temp
,
7
);
return
temp
;
sum1
=
_mm256_srai_epi16
(
sum1
,
7
);
return
sum1
;
}
static
INLINE
__m128i
convolve8_8_avx2
(
const
__m256i
*
const
s
,
...
...
@@ -82,16 +85,19 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s,
_mm256_castsi256_si128
(
f
[
2
]));
const
__m128i
x3
=
_mm_maddubs_epi16
(
_mm256_castsi256_si128
(
s
[
3
]),
_mm256_castsi256_si128
(
f
[
3
]));
// add and saturate the results together
const
__m128i
min_x2x1
=
_mm_min_epi16
(
x2
,
x1
);
const
__m128i
max_x2x1
=
_mm_max_epi16
(
x2
,
x1
);
__m128i
temp
=
_mm_adds_epi16
(
x0
,
x3
);
temp
=
_mm_adds_epi16
(
temp
,
min_x2x1
);
temp
=
_mm_adds_epi16
(
temp
,
max_x2x1
);
// round and shift by 7 bit each 16 bit
temp
=
_mm_adds_epi16
(
temp
,
k_64
);
temp
=
_mm_srai_epi16
(
temp
,
7
);
return
temp
;
__m128i
sum1
,
sum2
;
// sum the results together, saturating only on the final step
// adding x0 with x2 and x1 with x3 is the only order that prevents
// outranges for all filters
sum1
=
_mm_add_epi16
(
x0
,
x2
);
sum2
=
_mm_add_epi16
(
x1
,
x3
);
// add the rounding offset early to avoid another saturated add
sum1
=
_mm_add_epi16
(
sum1
,
k_64
);
sum1
=
_mm_adds_epi16
(
sum1
,
sum2
);
// shift by 7 bit each 16 bit
sum1
=
_mm_srai_epi16
(
sum1
,
7
);
return
sum1
;
}
#undef MM256_BROADCASTSI128_SI256
...
...
vpx_dsp/x86/convolve_ssse3.h
View file @
037e596f
...
...
@@ -48,16 +48,19 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
const
__m128i
x1
=
_mm_maddubs_epi16
(
s
[
1
],
f
[
1
]);
const
__m128i
x2
=
_mm_maddubs_epi16
(
s
[
2
],
f
[
2
]);
const
__m128i
x3
=
_mm_maddubs_epi16
(
s
[
3
],
f
[
3
]);
// add and saturate the results together
const
__m128i
min_x2x1
=
_mm_min_epi16
(
x2
,
x1
);
const
__m128i
max_x2x1
=
_mm_max_epi16
(
x2
,
x1
);
__m128i
temp
=
_mm_adds_epi16
(
x0
,
x3
);
temp
=
_mm_adds_epi16
(
temp
,
min_x2x1
);
temp
=
_mm_adds_epi16
(
temp
,
max_x2x1
);
// round and shift by 7 bit each 16 bit
temp
=
_mm_adds_epi16
(
temp
,
k_64
);
temp
=
_mm_srai_epi16
(
temp
,
7
);
return
temp
;
__m128i
sum1
,
sum2
;
// sum the results together, saturating only on the final step
// adding x0 with x2 and x1 with x3 is the only order that prevents
// outranges for all filters
sum1
=
_mm_add_epi16
(
x0
,
x2
);
sum2
=
_mm_add_epi16
(
x1
,
x3
);
// add the rounding offset early to avoid another saturated add
sum1
=
_mm_add_epi16
(
sum1
,
k_64
);
sum1
=
_mm_adds_epi16
(
sum1
,
sum2
);
// shift by 7 bit each 16 bit
sum1
=
_mm_srai_epi16
(
sum1
,
7
);
return
sum1
;
}
static
INLINE
__m128i
convolve8_8_even_offset_ssse3
(
const
__m128i
*
const
s
,
...
...
vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
View file @
037e596f
...
...
@@ -38,8 +38,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
const
uint8_t
*
src_ptr
,
ptrdiff_t
src_pitch
,
uint8_t
*
output_ptr
,
ptrdiff_t
output_pitch
,
uint32_t
output_height
,
const
int16_t
*
filter
)
{
__m128i
firstFilters
,
secondFilters
,
shuffle1
,
shuffle2
;
__m128i
srcRegFilt1
,
srcRegFilt2
,
srcRegFilt3
,
srcRegFilt4
;
__m128i
addFilterReg64
,
filtersReg
,
srcReg
,
minReg
;
__m128i
srcRegFilt1
,
srcRegFilt2
;
__m128i
addFilterReg64
,
filtersReg
,
srcReg
;
unsigned
int
i
;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
...
...
@@ -75,18 +75,16 @@ void vpx_filter_block1d4_h8_intrin_ssse3(
srcRegFilt1
=
_mm_maddubs_epi16
(
srcRegFilt1
,
firstFilters
);
srcRegFilt2
=
_mm_maddubs_epi16
(
srcRegFilt2
,
secondFilters
);
//
extract the higher half
o
f
the
lane
srcRegFilt3
=
_mm_srli_si128
(
srcRegFilt1
,
8
);
srcRegFilt
4
=
_mm_
srli_si128
(
srcRegFilt
2
,
8
);
//
sum the results together, saturating only
o
n
the
final step
// the specific order of the additions prevents outranges
srcRegFilt
1
=
_mm_
add_epi16
(
srcRegFilt
1
,
srcRegFilt2
);
minReg
=
_mm_min_epi16
(
srcRegFilt3
,
srcRegFilt2
);
// extract the higher half of the register
srcRegFilt2
=
_mm_srli_si128
(
srcRegFilt1
,
8
);
// add and saturate all the results together
srcRegFilt1
=
_mm_adds_epi16
(
srcRegFilt1
,
srcRegFilt4
);
srcRegFilt3
=
_mm_max_epi16
(
srcRegFilt3
,
srcRegFilt2
);
srcRegFilt1
=
_mm_adds_epi16
(
srcRegFilt1
,
minReg
);
srcRegFilt1
=
_mm_adds_epi16
(
srcRegFilt1
,
srcRegFilt3
);
srcRegFilt1
=
_mm_adds_epi16
(
srcRegFilt1
,
addFilterReg64
);
// add the rounding offset early to avoid another saturated add
srcRegFilt1
=
_mm_add_epi16
(
srcRegFilt1
,
addFilterReg64
);
srcRegFilt1
=
_mm_adds_epi16
(
srcRegFilt1
,
srcRegFilt2
);
// shift by 7 bit each 16 bits
srcRegFilt1
=
_mm_srai_epi16
(
srcRegFilt1
,
7
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment