Commit f059dcef authored by Guillaume BIENKOWSKI's avatar Guillaume BIENKOWSKI
Browse files

Add NEON tester when ARM_NEON is defined. This allows to check if some...

Add NEON tester when ARM_NEON is defined. This allows to check if some specific implementations are really well implemented and perform at the expected speed.
parent 1b5c06af
/*
mediastreamer2 library - modular sound and video processing and streaming
Copyright (C) 2006-2014 Belledonne Communications, Grenoble
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
#include <stdio.h>
#include <time.h>
#include "mediastreamer2_tester.h"
#include <speex/speex.h>
#include "libspeex/resample_neon.h"
#include <ortp/port.h>
#include <arm_neon.h>
#ifdef __ARM_NEON__
static int tester_init() {
ortp_set_log_level_mask(ORTP_MESSAGE | ORTP_WARNING | ORTP_ERROR | ORTP_FATAL);
ms_init();
srand(time(0));
return 0;
}
static int tester_cleanup() {
ms_exit();
return 0;
}
// tests the inner product with and without neon (using speex)
extern int libspeex_cpu_features;
//
spx_int32_t inner_product_neon_xcode(const spx_int16_t *a, const spx_int16_t *b, unsigned int len){
int16x4_t a4;
int16x4_t b4;
spx_int32_t sum = 0; // sum is 0
int32x4_t part = vdupq_n_s32(0);
int32x2_t dAccL, dAccH;
spx_int32_t s;
len >>=2;
while (len--) {
a4 = vld1_s16(a); // load 4 * 16b from a
b4 = vld1_s16(b); // load 4 * 16b from b
part = vmlal_s16(part, a4, b4); // part[n] += a[n] * b[n]
a+=4; b+=4;
}
/* Split 128-bit qAccumulator into 64-bit dAccL and dAccH for
* accumulation */
dAccL = vget_low_s32(part);
dAccH = vget_high_s32(part);
/* Accumulate 2 lanes in dAccL and dAccH into 2 lanes in dAccL */
dAccL = vadd_s32(dAccL, dAccH);
/* Accumulate 2 lanes in dAccL into first (and second) lane of dAccL */
dAccL = vpadd_s32(dAccL, dAccL);
s = vget_lane_s32(dAccL, 0);
// ms_message("S[%d] = %d", len,s);
/* Add accumulated value to retval */
sum += (s >> 6);
return sum;
}
spx_int32_t inner_product_neon_xcode_optim(const spx_int16_t *a, const spx_int16_t *b, unsigned int len){
int16x4_t a4;
int16x4_t b4;
spx_int32_t sum = 0; // sum is 0
int32x4_t partial = vdupq_n_s32(0);
int64x2_t back;
len >>=2;
while (len--) {
a4 = vld1_s16(a); // load 4 * 16b from a
b4 = vld1_s16(b); // load 4 * 16b from b
partial = vmlal_s16(partial, a4, b4); // part[n] += a[n] * b[n] vector multiply and add
a+=4; b+=4;
}
back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
back = vshrq_n_s64(back, 6); // shift by 6
sum += vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
partial = vdupq_n_s32(0); // reset partial sum
return sum;
}
spx_int32_t inner_product_neon_xcode_optim8(const spx_int16_t *a, const spx_int16_t *b, unsigned int len){
int16x8_t a8;
int16x8_t b8;
spx_int32_t sum = 0; // sum is 0
int32x4_t partial = vdupq_n_s32(0);
int64x2_t back;
len >>=3;
while (len--) {
a8 = vld1q_s16(a); // load 8 16b from a
b8 = vld1q_s16(b); // load 8 16b from b
partial = vmlal_s16(partial, vget_low_s16(a8), vget_low_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
partial = vmlal_s16(partial, vget_high_s16(a8), vget_high_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
a+=8; b+=8;
}
back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
back = vshrq_n_s64(back, 6); // shift by 6
sum += vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
partial = vdupq_n_s32(0); // reset partial sum
return sum;
}
spx_int32_t inner_product_neon_xcode_optim16(const spx_int16_t *a, const spx_int16_t *b, unsigned int len){
int16x8_t a8,b8;
int16x8_t c8,d8;
spx_int32_t sum = 0; // sum is 0
int32x4_t partial = vdupq_n_s32(0);
int32x4_t p2 = vdupq_n_s32(0);
int64x2_t back;
len >>=4;
while (len--) {
a8 = vld1q_s16(a); // load 8 16b from a
b8 = vld1q_s16(b); // load 8 16b from b
c8 = vld1q_s16(a+8); // load 8 16b from b
d8 = vld1q_s16(b+8); // load 8 16b from b
partial = vmlal_s16(partial, vget_low_s16(a8), vget_low_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
partial = vmlal_s16(partial, vget_high_s16(a8), vget_high_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
p2 = vmlal_s16(p2, vget_low_s16(c8), vget_low_s16(d8)); // part[n] += a[n] * b[n] vector multiply and add
p2 = vmlal_s16(p2, vget_high_s16(c8), vget_high_s16(d8)); // part[n] += a[n] * b[n] vector multiply and add
a+=16; b+=16;
}
back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
back = vpadalq_s32(back, p2); // add to all these the 2nd partial
back = vshrq_n_s64(back, 6); // shift by 6
sum += vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
partial = vdupq_n_s32(0); // reset partial sum
return sum;
}
/* intrinsics */
spx_int32_t inner_product_neon_intrinsics(const spx_int16_t *a, const spx_int16_t *b, unsigned int len){
int16x8_t a8;
int16x8_t b8;
int32x4_t partial = vdupq_n_s32(0);
int64x2_t back;
len >>=3;
while (len--) {
a8 = vld1q_s16(a); // load 8 16b from a
b8 = vld1q_s16(b); // load 8 16b from b
partial = vmlal_s16(partial, vget_low_s16(a8), vget_low_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
partial = vmlal_s16(partial, vget_high_s16(a8), vget_high_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
a+=8; b+=8;
}
back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
back = vshrq_n_s64(back, 6); // shift by 6
return vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
}
static void inner_product_test(void) {
#define SAMPLE_SIZE 64 /* has to be %8 and < 64 ! */
#define ITERATIONS 1000000
static spx_int16_t test_sample[SAMPLE_SIZE];
static spx_int16_t test_sample2[SAMPLE_SIZE];
int length = SAMPLE_SIZE;
uint64_t soft_ms, neon_ms;
// put some values to process
for( int i = 0; i<SAMPLE_SIZE; i++){
test_sample[i] = ortp_random() % 16384;
test_sample2[i] = ortp_random() % 16384;
}
if( (libspeex_cpu_features & SPEEX_LIB_CPU_FEATURE_NEON) != SPEEX_LIB_CPU_FEATURE_NEON){
ms_error("Can't test NEON features, skipping");
return;
}
// disable neon & perform inner product
libspeex_cpu_features &= ~SPEEX_LIB_CPU_FEATURE_NEON;
volatile spx_int32_t non_neon_result;
int i = ITERATIONS;
{
uint64_t start = ms_get_cur_time_ms();
while (i--) {
non_neon_result = inner_prod((const spx_int16_t*)test_sample, (const spx_int16_t*)test_sample2, length);
}
soft_ms = ms_get_cur_time_ms() - start;
}
// enable neon and perform the same operation
libspeex_cpu_features |= SPEEX_LIB_CPU_FEATURE_NEON;
i = ITERATIONS;
volatile spx_int32_t neon_result;
{
uint64_t start = ms_get_cur_time_ms();
while (i--) {
neon_result= inner_prod((const spx_int16_t*)test_sample, (const spx_int16_t*)test_sample2, length);
}
neon_ms = ms_get_cur_time_ms() - start;
}
float percent_off = ((float)abs(non_neon_result-neon_result))/MAX(non_neon_result, neon_result)*100;
ms_debug("XCode: %10d, NON Neon: %10d - diff: %d - percent off: %f",
non_neon_result, neon_result, abs(non_neon_result-neon_result), percent_off);
// we expect the result to be very similar and at least 5 times faster with NEON
CU_ASSERT(percent_off < 1.0);
CU_ASSERT((float)neon_ms < (float)soft_ms/5);
}
static test_t tests[] = {
{ "Inner product", inner_product_test }
};
test_suite_t neon_test_suite = {
"NEON",
tester_init,
tester_cleanup,
sizeof(tests)/sizeof(test_t),
tests
};
#endif // ARM NEON
...@@ -32,6 +32,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ...@@ -32,6 +32,12 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#ifdef __APPLE__ #ifdef __APPLE__
#include "TargetConditionals.h" #include "TargetConditionals.h"
#endif #endif
#if TARGET_OS_IPHONE
#import <UIKit/UIKit.h>
#include <AudioToolbox/AudioToolbox.h>
#include <CoreFoundation/CFRunLoop.h>
#endif
static test_suite_t **test_suite = NULL; static test_suite_t **test_suite = NULL;
static int nb_test_suites = 0; static int nb_test_suites = 0;
...@@ -128,7 +134,9 @@ void mediastreamer2_tester_init(void) { ...@@ -128,7 +134,9 @@ void mediastreamer2_tester_init(void) {
#endif #endif
add_test_suite(&framework_test_suite); add_test_suite(&framework_test_suite);
add_test_suite(&player_test_suite); add_test_suite(&player_test_suite);
#ifdef __ARM_NEON__
add_test_suite(&neon_test_suite);
#endif
for (i = 0; i < mediastreamer2_tester_nb_test_suites(); i++) { for (i = 0; i < mediastreamer2_tester_nb_test_suites(); i++) {
run_test_suite(test_suite[i]); run_test_suite(test_suite[i]);
} }
...@@ -221,6 +229,31 @@ void helper(const char *name) { ...@@ -221,6 +229,31 @@ void helper(const char *name) {
#ifndef WINAPI_FAMILY_PHONE_APP #ifndef WINAPI_FAMILY_PHONE_APP
#if TARGET_OS_MAC && !TARGET_OS_IPHONE #if TARGET_OS_MAC && !TARGET_OS_IPHONE
int _main (int argc, char *argv[]) { int _main (int argc, char *argv[]) {
#elif TARGET_OS_IPHONE
int g_argc;
char** g_argv;
static int _main(int argc, char * argv[]);
void stop_handler(int sig) {
return;
}
static void* apple_main(void* data) {
_main(g_argc,g_argv);
return NULL;
}
int main(int argc, char * argv[]) {
pthread_t main_thread;
g_argc=argc;
g_argv=argv;
pthread_create(&main_thread,NULL,apple_main,NULL);
NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
int value = UIApplicationMain(0, nil, nil, nil);
[pool release];
return value;
pthread_join(main_thread,NULL);
return 0;
}
static int _main(int argc, char * argv[]) {
#else #else
int main (int argc, char *argv[]) { int main (int argc, char *argv[]) {
#endif #endif
......
...@@ -54,6 +54,9 @@ extern test_suite_t audio_stream_test_suite; ...@@ -54,6 +54,9 @@ extern test_suite_t audio_stream_test_suite;
extern test_suite_t video_stream_test_suite; extern test_suite_t video_stream_test_suite;
extern test_suite_t framework_test_suite; extern test_suite_t framework_test_suite;
extern test_suite_t player_test_suite; extern test_suite_t player_test_suite;
#ifdef __ARM_NEON__
extern test_suite_t neon_test_suite;
#endif
#define CU_ASSERT_IN_RANGE(value, inf, sup) \ #define CU_ASSERT_IN_RANGE(value, inf, sup) \
CU_ASSERT_TRUE(value >= inf); \ CU_ASSERT_TRUE(value >= inf); \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment