/*
 mediastreamer2 library - modular sound and video processing and streaming
 Copyright (C) 2006-2014 Belledonne Communications, Grenoble

 This program is free software; you can redistribute it and/or
 modify it under the terms of the GNU General Public License
 as published by the Free Software Foundation; either version 2
 of the License, or (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program; if not, write to the Free Software
 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#include <stdio.h>
#include <time.h>
#include "mediastreamer2_tester.h"
#include <speex/speex.h>
//#include "libspeex/resample_neon.h"
#include <ortp/port.h>

#ifdef __ARM_NEON__
#include <arm_neon.h>

static int tester_init() {
	ortp_set_log_level_mask(ORTP_MESSAGE | ORTP_WARNING | ORTP_ERROR | ORTP_FATAL);
	ms_init();
	srand(time(0));
	return 0;
}

static int tester_cleanup() {
	ms_exit();
	return 0;
}


// tests the inner product with and without neon (using speex)
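/* Both symbols below are expected to come from the patched speex tree
 * (git://git.linphone.org/speex.git, see the warning at the end of the test):
 * libspeex_cpu_features selects the code path at runtime and inner_prod is
 * the inner product routine being exercised. */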
extern int libspeex_cpu_features;
extern spx_int32_t inner_prod( const spx_int16_t* a, const spx_int16_t* b, int len);

#if 0 /* These are inner product implementations that we used for fixing speex's one */
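/* For orientation, a plain-C sketch of what these variants compute, assuming
 * the same widen-multiply, accumulate, then >>6 scaling used by the NEON code
 * below (the function name is only illustrative): */
spx_int32_t ms_inner_product_scalar_sketch(const spx_int16_t *a, const spx_int16_t *b, unsigned int len) {
	int64_t acc = 0;
	unsigned int i;

	for (i = 0; i < len; i++) {
		acc += (spx_int32_t)a[i] * (spx_int32_t)b[i]; // widen to 32 bits before multiplying
	}
	return (spx_int32_t)(acc >> 6); // same final >>6 scaling as the NEON variants
}
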
spx_int32_t ms_inner_product_neon_xcode(const spx_int16_t *a, const spx_int16_t *b, unsigned int len) {
	int16x4_t a4;
	int16x4_t b4;
	spx_int32_t sum = 0; // sum is 0
	int32x4_t part = vdupq_n_s32(0);
	int32x2_t dAccL, dAccH;
	spx_int32_t s;

	len >>=2;

	while (len--) {
		a4 = vld1_s16(a); // load 4 * 16b from a
		b4 = vld1_s16(b); // load 4 * 16b from b
		part = vmlal_s16(part, a4, b4); // part[n] += a[n] * b[n]
		a+=4;
		b+=4;
	}
	/* Split 128-bit qAccumulator into 64-bit dAccL and dAccH for
	 * accumulation */
	dAccL = vget_low_s32(part);
	dAccH = vget_high_s32(part);

	/* Accumulate 2 lanes in dAccL and dAccH into 2 lanes in dAccL */
	dAccL = vadd_s32(dAccL, dAccH);
	/* Accumulate 2 lanes in dAccL into first (and second) lane of dAccL */
	dAccL = vpadd_s32(dAccL, dAccL);

	s = vget_lane_s32(dAccL, 0);
	// ms_message("S[%d] = %d", len,s);

	/* Add accumulated value to retval */
	sum += (s >> 6);

	return sum;

}

spx_int32_t ms_inner_product_neon_xcode_optim(const spx_int16_t *a, const spx_int16_t *b, unsigned int len) {
	int16x4_t a4;
	int16x4_t b4;
	spx_int32_t sum = 0; // sum is 0
	int32x4_t partial = vdupq_n_s32(0);
	int64x2_t back;

	len >>=2;

	while (len--) {
		a4 = vld1_s16(a); // load 4 * 16b from a
		b4 = vld1_s16(b); // load 4 * 16b from b

		partial = vmlal_s16(partial, a4, b4); // part[n] += a[n] * b[n] vector multiply and add
		a+=4;
		b+=4;
	}
	back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
	back = vshrq_n_s64(back, 6); // shift by 6

	sum += vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
	partial = vdupq_n_s32(0); // reset partial sum

	return sum;

}

spx_int32_t ms_inner_product_neon_xcode_optim8(const spx_int16_t *a, const spx_int16_t *b, unsigned int len) {
	int16x8_t a8;
	int16x8_t b8;
	spx_int32_t sum = 0; // sum is 0
	int32x4_t partial = vdupq_n_s32(0);
	int64x2_t back;

	len >>=3;

	while (len--) {
		a8 = vld1q_s16(a); // load 8 16b from a
		b8 = vld1q_s16(b); // load 8 16b from b

		partial = vmlal_s16(partial, vget_low_s16(a8), vget_low_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
		partial = vmlal_s16(partial, vget_high_s16(a8), vget_high_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
		a+=8;
		b+=8;
	}
	back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
	back = vshrq_n_s64(back, 6); // shift by 6

	sum += vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
	partial = vdupq_n_s32(0); // reset partial sum

	return sum;

}

spx_int32_t ms_inner_product_neon_xcode_optim16(const spx_int16_t *a, const spx_int16_t *b, unsigned int len) {
	int16x8_t a8,b8;
	int16x8_t c8,d8;
	spx_int32_t sum = 0; // sum is 0
	int32x4_t partial = vdupq_n_s32(0);
	int32x4_t p2 =  vdupq_n_s32(0);
	int64x2_t back;

	len >>=4;

	while (len--) {
		a8 = vld1q_s16(a); // load 8 * 16b from a
		b8 = vld1q_s16(b); // load 8 * 16b from b
		c8 = vld1q_s16(a+8); // load next 8 * 16b from a
		d8 = vld1q_s16(b+8); // load next 8 * 16b from b

		partial = vmlal_s16(partial, vget_low_s16(a8), vget_low_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
		partial = vmlal_s16(partial, vget_high_s16(a8), vget_high_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
		p2 = vmlal_s16(p2, vget_low_s16(c8), vget_low_s16(d8)); // part[n] += a[n] * b[n] vector multiply and add
		p2 = vmlal_s16(p2, vget_high_s16(c8), vget_high_s16(d8)); // part[n] += a[n] * b[n] vector multiply and add

		a+=16;
		b+=16;
	}
	back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
	back = vpadalq_s32(back, p2); // add to all these the 2nd partial
	back = vshrq_n_s64(back, 6); // shift by 6

	sum += vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
	partial = vdupq_n_s32(0); // reset partial sum

	return sum;

}

/* intrinsics */
spx_int32_t ms_inner_product_neon_intrinsics(const spx_int16_t *a, const spx_int16_t *b, unsigned int len) {
	int16x8_t a8;
	int16x8_t b8;
	int32x4_t partial = vdupq_n_s32(0);
	int64x2_t back;

	len >>=3;

	while (len--) {
		a8 = vld1q_s16(a); // load 8 16b from a
		b8 = vld1q_s16(b); // load 8 16b from b

		partial = vmlal_s16(partial, vget_low_s16(a8), vget_low_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
		partial = vmlal_s16(partial, vget_high_s16(a8), vget_high_s16(b8)); // part[n] += a[n] * b[n] vector multiply and add
		a+=8;
		b+=8;
	}
	back = vpaddlq_s32(partial); // sum the 4 s32 in 2 64b
	back = vshrq_n_s64(back, 6); // shift by 6

	return vgetq_lane_s64(back, 0) + vgetq_lane_s64(back, 1);
}
#endif


static void inner_product_test(void) {
#ifdef SPEEX_LIB_CPU_FEATURE_NEON
#define SAMPLE_SIZE 64 /* must be a multiple of 8 */
#define ITERATIONS 1000000
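	/* The same two SAMPLE_SIZE-sample buffers are run through inner_prod()
	 * ITERATIONS times, first with libspeex_cpu_features cleared and then with
	 * the NEON bit set; the results and elapsed times are compared below. */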
	static spx_int16_t test_sample[SAMPLE_SIZE];
	static spx_int16_t test_sample2[SAMPLE_SIZE];
	int length = SAMPLE_SIZE;
	uint64_t soft_ms, neon_ms;
	int i;
	volatile spx_int32_t non_neon_result;
	volatile spx_int32_t neon_result;
	float percent_off;
	bool_t fast_enough;

	// put some values to process
	for( i = 0; i<SAMPLE_SIZE; i++) {
		test_sample[i] = ortp_random() % 16384;
		test_sample2[i] = ortp_random() % 16384;
	}

	// ARM_NEON is enabled, we can force libspeex to use it
	libspeex_cpu_features = SPEEX_LIB_CPU_FEATURE_NEON;

	// disable neon & perform inner product
	libspeex_cpu_features &= ~SPEEX_LIB_CPU_FEATURE_NEON;
	i = ITERATIONS;
	{
		uint64_t start = ms_get_cur_time_ms();
		while (i--) {
			non_neon_result = inner_prod((const spx_int16_t*)test_sample, (const spx_int16_t*)test_sample2, length);
		}
		soft_ms = ms_get_cur_time_ms() - start;
	}

	// enable neon and perform the same operation
	libspeex_cpu_features |= SPEEX_LIB_CPU_FEATURE_NEON;
	i = ITERATIONS;
	{
		uint64_t start = ms_get_cur_time_ms();
		while (i--) {
			neon_result= inner_prod((const spx_int16_t*)test_sample, (const spx_int16_t*)test_sample2, length);
		}
		neon_ms = ms_get_cur_time_ms() - start;
	}

	percent_off = ((float)abs(non_neon_result-neon_result))/MAX(non_neon_result, neon_result)*100;
	ms_debug("non-NEON: %10d, NEON: %10d - diff: %d - percent off: %f",
	         non_neon_result, neon_result, abs(non_neon_result-neon_result), percent_off);

	fast_enough = (float)neon_ms < (float)soft_ms/5;

	// we expect the results to be very similar and the NEON version to be at least 5 times faster
	CU_ASSERT(percent_off < 1.0);
	CU_ASSERT(fast_enough);
	ms_message("NEON = %llu ms, SOFT: %llu ms", neon_ms, soft_ms);
	if( !fast_enough ) {
		ms_error("NEON does not seem to be fast enough");
	}
#else
	ms_warning("Test skipped: not using the speex from git://git.linphone.org/speex.git with NEON support");
#endif

}

static test_t tests[] = {
	{ "Inner product", inner_product_test }
};

test_suite_t neon_test_suite = {
	"NEON",
	tester_init,
	tester_cleanup,
	sizeof(tests)/sizeof(test_t),
	tests
};

#endif // ARM NEON