Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
BC
public
external
libvpx
Commits
127864de
Commit
127864de
authored
Oct 03, 2017
by
Linfeng Zhang
Browse files
Generalize 2:1 vp9_scale_and_extend_frame_ssse3()
Change-Id: I882da3a04884d5fabd4cd591c28682cbb2d76aa5
parent
b8094425
Changes
2
Hide whitespace changes
Inline
Side-by-side
vp9/encoder/x86/vp9_frame_scale_ssse3.c
View file @
127864de
...
...
@@ -15,14 +15,15 @@
#include "./vpx_scale_rtcd.h"
#include "vpx_dsp/x86/convolve_ssse3.h"
#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_scale/yv12config.h"
static
void
scale_plane_2_to_1_phase_0
(
const
uint8_t
*
src
,
const
ptrdiff_t
src_stride
,
uint8_t
*
dst
,
const
ptrdiff_t
dst_stride
,
const
int
dst_w
,
const
int
dst_h
)
{
const
__m128i
mask
=
_mm_set1_epi16
(
0x00FF
);
const
int
max_width
=
(
dst_w
+
15
)
&
~
15
;
const
__m128i
mask
=
_mm_set1_epi16
(
0x00FF
);
int
y
=
dst_h
;
do
{
...
...
@@ -43,6 +44,174 @@ static void scale_plane_2_to_1_phase_0(const uint8_t *src,
}
while
(
--
y
);
}
// Two-tap filter kernel shared by the horizontal and vertical bilinear passes.
//
// s:    two registers of 16 bytes each; every adjacent byte pair is one
//       filter input.
// c0c1: the tap pair, replicated into every 16-bit lane by the caller.
//
// Each byte pair is multiplied by the taps and summed into a 16-bit lane,
// 64 is added for rounding, the sum is shifted right by 7, and the 16 results
// (8 from s[0] followed by 8 from s[1]) are packed to bytes with unsigned
// saturation.
static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s,
                                                  const __m128i c0c1) {
  const __m128i rounding = _mm_set1_epi16(1 << 6);
  __m128i lo = _mm_maddubs_epi16(s[0], c0c1);
  __m128i hi = _mm_maddubs_epi16(s[1], c0c1);

  // (sum + 64) >> 7 in each 16-bit lane.
  lo = _mm_srai_epi16(_mm_adds_epi16(lo, rounding), 7);
  hi = _mm_srai_epi16(_mm_adds_epi16(hi, rounding), 7);

  return _mm_packus_epi16(lo, hi);
}
// Downscales one plane by 2 in each direction with the two-tap filter c0c1.
//
// src/src_stride: source plane and its row pitch in bytes.
// dst/dst_stride: destination plane and its row pitch in bytes.
// dst_w/dst_h:    destination size. The width is rounded up to a multiple of
//                 16 and every row is processed in whole 16-byte stores, so
//                 up to ((dst_w + 15) & ~15) bytes are written per row and
//                 twice that many are read from each of two source rows.
// c0c1:           tap pair passed through to scale_plane_bilinear_kernel().
//
// Both loops are do/while, so dst_w and dst_h are expected to be non-zero.
static void scale_plane_2_to_1_bilinear(const uint8_t *src,
                                        const ptrdiff_t src_stride,
                                        uint8_t *dst,
                                        const ptrdiff_t dst_stride,
                                        const int dst_w, const int dst_h,
                                        const __m128i c0c1) {
  const int max_width = (dst_w + 15) & ~15;
  int rows_left = dst_h;

  do {
    int cols_left = max_width;

    do {
      __m128i in[2], filtered[2];
      const uint8_t *const src_below = src + src_stride;

      // Horizontal pass on the even source row: 32 bytes -> 16 bytes.
      in[0] = _mm_loadu_si128((const __m128i *)(src + 0));
      in[1] = _mm_loadu_si128((const __m128i *)(src + 16));
      filtered[0] = scale_plane_bilinear_kernel(in, c0c1);

      // Horizontal pass on the odd source row.
      in[0] = _mm_loadu_si128((const __m128i *)(src_below + 0));
      in[1] = _mm_loadu_si128((const __m128i *)(src_below + 16));
      filtered[1] = scale_plane_bilinear_kernel(in, c0c1);

      // Vertical pass: interleave the two filtered rows so that each byte
      // pair holds vertically adjacent samples, then reuse the same kernel.
      in[0] = _mm_unpacklo_epi8(filtered[0], filtered[1]);
      in[1] = _mm_unpackhi_epi8(filtered[0], filtered[1]);
      filtered[0] = scale_plane_bilinear_kernel(in, c0c1);

      _mm_storeu_si128((__m128i *)dst, filtered[0]);

      src += 32;
      dst += 16;
      cols_left -= 16;
    } while (cols_left);

    // Rewind to the row start and step two source rows / one output row.
    src += 2 * (src_stride - max_width);
    dst += dst_stride - max_width;
  } while (--rows_left);
}
// Downscales one plane by 2 in each direction with an 8-tap filter, as two
// separable passes through temp_buffer.
//
// src/src_stride: source plane and row pitch; src is moved up and left
//                 before filtering so rows/columns before it are read.
// dst/dst_stride: destination plane and row pitch.
// w/h:            destination size, both non-zero (asserted). The passes
//                 round these up (w to 4 and 8, h to 4), so dst is written in
//                 8-column by 4-row tiles that may extend past w and h.
// coef:           filter taps handed to shuffle_filter_ssse3().
// temp_buffer:    scratch space for the horizontally filtered rows. It is
//                 laid out with a pitch of 2 * width_hor bytes, each pitch
//                 holding two filtered rows interleaved byte by byte.
//
// NOTE(review): the horizontal pass fills height_hor / 2 interleaved rows,
// while the vertical pass appears to read height_ver + 3 of them, 16 bytes at
// a time. Confirm the caller's allocation covers those reads for every w and
// h (e.g. h % 4 == 1, or width_hor not a multiple of 8) -- TODO verify.
static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride,
                                       uint8_t *dst, const int dst_stride,
                                       const int w, const int h,
                                       const int16_t *const coef,
                                       uint8_t *const temp_buffer) {
  const int width_hor = (w + 3) & ~3;
  const int width_ver = (w + 7) & ~7;
  const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7;
  const int height_ver = (h + 3) & ~3;
  const int tmp_stride = 2 * width_hor;
  uint8_t *tmp = temp_buffer;
  __m128i taps[4];
  __m128i in[11];
  __m128i out[4];
  int cols;
  int rows = height_hor;

  assert(w && h);

  shuffle_filter_ssse3(coef, taps);
  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1;

  // Horizontal pass: 8 source rows at a time, 4 output columns per step.
  do {
    load_8bit_8x8(src + 2, src_stride, in);
    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77 (overlapped)
    transpose_16bit_4x8(in, in);
    cols = width_hor;

    do {
      src += 8;
      load_8bit_8x8(src, src_stride, &in[3]);
      // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
      transpose_16bit_4x8(&in[3], &in[3]);

      out[0] = convolve8_8_ssse3(&in[0], taps);  // 00 10 20 30 40 50 60 70
      out[1] = convolve8_8_ssse3(&in[1], taps);  // 01 11 21 31 41 51 61 71
      out[2] = convolve8_8_ssse3(&in[2], taps);  // 02 12 22 32 42 52 62 72
      out[3] = convolve8_8_ssse3(&in[3], taps);  // 03 13 23 33 43 53 63 73

      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
      out[0] = _mm_packus_epi16(out[0], out[2]);
      out[1] = _mm_packus_epi16(out[1], out[3]);

      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
      out[2] = _mm_unpacklo_epi16(out[0], out[1]);
      out[3] = _mm_unpackhi_epi16(out[0], out[1]);

      store_8bit_4x4_sse2(out[2], tmp + 0, tmp_stride);
      store_8bit_4x4_sse2(out[3], tmp + 4, tmp_stride);

      // Keep the three overlapping registers for the next 4 columns.
      in[0] = in[4];
      in[1] = in[5];
      in[2] = in[6];

      tmp += 8;
      cols -= 4;
    } while (cols);

    src += 8 * src_stride - 2 * width_hor;
    tmp += 6 * width_hor;
    rows -= 8;
  } while (rows);

  // Vertical pass: 8 output columns at a time, 4 output rows per step.
  cols = width_ver;
  tmp = temp_buffer;

  do {
    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77 (overlapped)
    loadu_8bit_16x4(tmp, tmp_stride, in);
    tmp += 6 * width_hor;
    rows = height_ver;

    do {
      // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
      loadu_8bit_16x4(tmp, tmp_stride, &in[3]);
      tmp += 8 * width_hor;

      out[0] = convolve8_8_ssse3(&in[0], taps);
      out[1] = convolve8_8_ssse3(&in[1], taps);
      out[2] = convolve8_8_ssse3(&in[2], taps);
      out[3] = convolve8_8_ssse3(&in[3], taps);

      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
      out[0] = _mm_packus_epi16(out[0], out[1]);
      out[1] = _mm_packus_epi16(out[2], out[3]);

      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), out[0]);
      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), out[0]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), out[1]);
      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), out[1]);

      // Keep the three overlapping registers for the next 4 rows.
      in[0] = in[4];
      in[1] = in[5];
      in[2] = in[6];

      dst += 4 * dst_stride;
      rows -= 4;
    } while (rows);

    // Back to the top of the plane, then right by 8 output columns
    // (16 interleaved bytes in the scratch buffer).
    tmp -= width_hor * (2 * height_ver + 6);
    tmp += 16;
    dst -= height_ver * dst_stride;
    dst += 8;
    cols -= 8;
  } while (cols);
}
static
INLINE
__m128i
scale_1_to_2_phase_0_kernel
(
const
__m128i
*
const
s
,
const
__m128i
*
const
f
)
{
__m128i
ss
[
4
],
temp
;
...
...
@@ -165,17 +334,54 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const
int
dst_h
=
dst
->
y_crop_height
;
int
scaled
=
0
;
if
(
dst_w
*
2
==
src_w
&&
dst_h
*
2
==
src_h
&&
phase_scaler
==
0
)
{
// phase_scaler is usually 0 or 8.
assert
(
phase_scaler
>=
0
&&
phase_scaler
<
16
);
if
(
dst_w
*
2
==
src_w
&&
dst_h
*
2
==
src_h
)
{
// 2 to 1
const
int
dst_uv_w
=
dst_w
/
2
;
const
int
dst_uv_h
=
dst_h
/
2
;
scaled
=
1
;
scale_plane_2_to_1_phase_0
(
src
->
y_buffer
,
src
->
y_stride
,
dst
->
y_buffer
,
dst
->
y_stride
,
dst_w
,
dst_h
);
scale_plane_2_to_1_phase_0
(
src
->
u_buffer
,
src
->
uv_stride
,
dst
->
u_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
);
scale_plane_2_to_1_phase_0
(
src
->
v_buffer
,
src
->
uv_stride
,
dst
->
v_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
);
if
(
phase_scaler
==
0
)
{
scale_plane_2_to_1_phase_0
(
src
->
y_buffer
,
src
->
y_stride
,
dst
->
y_buffer
,
dst
->
y_stride
,
dst_w
,
dst_h
);
scale_plane_2_to_1_phase_0
(
src
->
u_buffer
,
src
->
uv_stride
,
dst
->
u_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
);
scale_plane_2_to_1_phase_0
(
src
->
v_buffer
,
src
->
uv_stride
,
dst
->
v_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
);
}
else
if
(
filter_type
==
BILINEAR
)
{
const
int16_t
c0
=
vp9_filter_kernels
[
BILINEAR
][
phase_scaler
][
3
];
const
int16_t
c1
=
vp9_filter_kernels
[
BILINEAR
][
phase_scaler
][
4
];
const
__m128i
c0c1
=
_mm_set1_epi16
(
c0
|
(
c1
<<
8
));
// c0 and c1 >= 0
scale_plane_2_to_1_bilinear
(
src
->
y_buffer
,
src
->
y_stride
,
dst
->
y_buffer
,
dst
->
y_stride
,
dst_w
,
dst_h
,
c0c1
);
scale_plane_2_to_1_bilinear
(
src
->
u_buffer
,
src
->
uv_stride
,
dst
->
u_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
,
c0c1
);
scale_plane_2_to_1_bilinear
(
src
->
v_buffer
,
src
->
uv_stride
,
dst
->
v_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
,
c0c1
);
}
else
{
const
int
buffer_stride
=
(
dst_w
+
3
)
&
~
3
;
const
int
buffer_height
=
(
2
*
dst_h
+
SUBPEL_TAPS
-
2
+
7
)
&
~
7
;
uint8_t
*
const
temp_buffer
=
(
uint8_t
*
)
malloc
(
buffer_stride
*
buffer_height
);
if
(
temp_buffer
)
{
scale_plane_2_to_1_general
(
src
->
y_buffer
,
src
->
y_stride
,
dst
->
y_buffer
,
dst
->
y_stride
,
dst_w
,
dst_h
,
vp9_filter_kernels
[
filter_type
][
phase_scaler
],
temp_buffer
);
scale_plane_2_to_1_general
(
src
->
u_buffer
,
src
->
uv_stride
,
dst
->
u_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
,
vp9_filter_kernels
[
filter_type
][
phase_scaler
],
temp_buffer
);
scale_plane_2_to_1_general
(
src
->
v_buffer
,
src
->
uv_stride
,
dst
->
v_buffer
,
dst
->
uv_stride
,
dst_uv_w
,
dst_uv_h
,
vp9_filter_kernels
[
filter_type
][
phase_scaler
],
temp_buffer
);
free
(
temp_buffer
);
}
else
{
scaled
=
0
;
}
}
}
else
if
(
dst_w
==
src_w
*
2
&&
dst_h
==
src_h
*
2
&&
phase_scaler
==
0
)
{
// 1 to 2
uint8_t
*
const
temp_buffer
=
(
uint8_t
*
)
malloc
(
8
*
((
src_w
+
7
)
&
~
7
));
...
...
vpx_dsp/x86/mem_sse2.h
View file @
127864de
...
...
@@ -55,16 +55,22 @@ static INLINE void load_8bit_16x8(const uint8_t *const s,
d
[
7
]
=
_mm_load_si128
((
const
__m128i
*
)(
s
+
7
*
stride
));
}
static
INLINE
void
loadu_8bit_16x
8
(
const
uint8_t
*
const
s
,
static
INLINE
void
loadu_8bit_16x
4
(
const
uint8_t
*
const
s
,
const
ptrdiff_t
stride
,
__m128i
*
const
d
)
{
d
[
0
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
0
*
stride
));
d
[
1
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
1
*
stride
));
d
[
2
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
2
*
stride
));
d
[
3
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
3
*
stride
));
d
[
4
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
4
*
stride
));
d
[
5
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
5
*
stride
));
d
[
6
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
6
*
stride
));
d
[
7
]
=
_mm_loadu_si128
((
const
__m128i
*
)(
s
+
7
*
stride
));
}
// Loads eight rows of 16 bytes each, with no alignment requirement, into
// d[0]..d[7]. The work is delegated to loadu_8bit_16x4(), once for rows 0-3
// and once for rows 4-7.
//
// s:      address of the first row.
// stride: distance in bytes between consecutive rows.
// d:      destination array of at least eight registers.
static INLINE void loadu_8bit_16x8(const uint8_t *const s,
                                   const ptrdiff_t stride, __m128i *const d) {
  int first_row;

  for (first_row = 0; first_row < 8; first_row += 4) {
    loadu_8bit_16x4(s + first_row * stride, stride, &d[first_row]);
  }
}
// Stores the upper 64 bits of s to the 8 bytes at d; the counterpart of
// _mm_storel_epi64() for the high half. Implemented with the float-domain
// _mm_storeh_pi() on a reinterpreted register.
static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
  const __m128 as_ps = _mm_castsi128_ps(s);
  __m64 *const out = (__m64 *)d;

  _mm_storeh_pi(out, as_ps);
}
static
INLINE
void
store_8bit_4x4
(
const
__m128i
*
const
s
,
uint8_t
*
const
d
,
...
...
@@ -75,6 +81,17 @@ static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
*
(
int
*
)(
d
+
3
*
stride
)
=
_mm_cvtsi128_si32
(
s
[
3
]);
}
// Stores a 4x4 byte block held in a single register.
//
// s:      16 bytes, four consecutive 4-byte rows.
// d:      address of the first output row.
// stride: distance in bytes between output rows.
//
// The register is split into four copies, each shifted right so that row k
// sits in the low 4 bytes of element k, then handed to store_8bit_4x4().
static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
                                       const ptrdiff_t stride) {
  __m128i rows[4];

  rows[0] = s;                        // row 0 already in the low 4 bytes
  rows[1] = _mm_srli_si128(s, 4);     // row 1
  rows[2] = _mm_srli_si128(s, 8);     // row 2
  rows[3] = _mm_srli_si128(s, 12);    // row 3

  store_8bit_4x4(rows, d, stride);
}
static
INLINE
void
store_8bit_8x8
(
const
__m128i
*
const
s
,
uint8_t
*
const
d
,
const
ptrdiff_t
stride
)
{
_mm_storel_epi64
((
__m128i
*
)(
d
+
0
*
stride
),
s
[
0
]);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment