Commit 8223d878 authored by Erwan Croze's avatar Erwan Croze 👋🏻

Add voice silence detection with speex in msvolume filter

parent 7cb43670
Pipeline #377 failed with stage
in 0 seconds
......@@ -694,7 +694,6 @@ typedef struct _MSPinFormat{
**/
#define MS_FILTER_OUTPUT_FMT_CHANGED MS_FILTER_BASE_EVENT_NO_ARG(0) /**<triggered whenever a filter decides to change its output format for one or more output pins*/
/* DEPRECATED specific methods: to be moved into implementation specific header files - DO NOT USE IN NEW CODE*/
#define MS_FILTER_SET_FILTERLENGTH MS_FILTER_BASE_METHOD(12,int)
#define MS_FILTER_SET_OUTPUT_SAMPLE_RATE MS_FILTER_BASE_METHOD(13,int)
......
......@@ -81,6 +81,21 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#define MS_VOLUME_DB_LOWEST (-120) /*arbitrary value returned when linear volume is 0*/
/**
* Enable/disable silence detection
**/
#define MS_VOLUME_ENABLE_SILENCE_DETECTION MS_FILTER_METHOD(MS_VOLUME_ID,20,int)
/**
* Set the silence duration threshold, in milliseconds
**/
#define MS_VOLUME_SET_SILENCE_DURATION_THRESHOLD MS_FILTER_METHOD(MS_VOLUME_ID,21,unsigned int)
/**
* Triggered once when no voice has been detected for the configured silence duration threshold
**/
#define MS_VOLUME_EVENT_SILENCE_DETECTED MS_FILTER_EVENT_NO_ARG(MS_VOLUME_ID,0)
extern MSFilterDesc ms_volume_desc;
#endif
......@@ -61,6 +61,10 @@ typedef struct Volume{
float target_gain; /*the target gain chosen by echo limiter and noise gate*/
int sustain_time; /* time in ms for which echo limiter remains active after resuming from speech to silence.*/
int sustain_dur;
unsigned int silence_duration; // silence threshold duration in ms
int silence_detection_enable; // silence detection enabled information
int silence_event_send;
uint64_t last_voice_detection; // last time voice was detected
MSFilter *peer;
#ifdef HAVE_SPEEXDSP
SpeexPreprocessState *speex_pp;
......@@ -107,6 +111,10 @@ static void volume_init(MSFilter *f){
v->ng_floorgain=min_ng_floorgain;
v->ng_gain = 1;
v->remove_dc=FALSE;
v->silence_detection_enable = 0;
v->silence_duration = 0;
v->last_voice_detection = 0;
v->silence_event_send = 0;
#ifdef HAVE_SPEEXDSP
v->speex_pp=NULL;
#endif
......@@ -163,15 +171,18 @@ static int volume_get_linear(MSFilter *f, void *arg){
*farg = v->energy;
return 0;
}
// use our builtin agc
#if 0
static float volume_agc_process(Volume *v, mblk_t *om){
static float volume_agc_process(MSFilter *f, mblk_t *om){
Volume *v = (Volume*) f->data;
speex_preprocess_run(v->speex_pp,(int16_t*)om->b_rptr);
return 1;
}
#else
static float volume_agc_process(Volume *v, mblk_t *om) {
static float volume_agc_process(MSFilter *f, mblk_t *om) {
Volume *v = (Volume*) f->data;
static int counter;
// target is: 1
float gain_reduct = (agc_threshold + v->level_pk) / 1;
......@@ -180,11 +191,31 @@ static float volume_agc_process(Volume *v, mblk_t *om) {
ms_debug("_level=%f, gain reduction=%f, gain=%f, ng_gain=%f %f %f",
v->level_pk, gain_reduct, v->gain, v->ng_gain, v->ng_threshold, v->static_gain);
}
return gain_reduct;
}
#endif
/**
 * Run Speex voice activity detection on one audio buffer and notify
 * MS_VOLUME_EVENT_SILENCE_DETECTED once no voice has been detected for at
 * least v->silence_duration milliseconds of ticker time.
 *
 * The event is sent only once per silence period: the "already sent" flag is
 * cleared again as soon as voice is detected.
 *
 * This is a no-op when built without HAVE_SPEEXDSP, when the Speex
 * preprocessor state has not been created, or when the threshold is 0.
 *
 * @param f  the volume filter; f->data holds its Volume state and
 *           f->ticker->time is used as the clock.
 * @param om buffer of 16-bit samples handed to speex_preprocess_run().
 */
static void volume_vad_process(MSFilter *f, mblk_t *om) {
#ifdef HAVE_SPEEXDSP
	Volume *v = (Volume*) f->data;

	if (v->speex_pp == NULL || v->silence_duration == 0) return;

	/* speex_preprocess_run() always consumes a whole frame of the size given
	 * to speex_preprocess_state_init() (v->nsamples samples). The light
	 * processing path forwards input buffers of arbitrary length, so skip
	 * buffers that are too short instead of reading past their end. */
	if ((size_t)(om->b_wptr - om->b_rptr) < (size_t)v->nsamples * sizeof(int16_t)) return;

	if (speex_preprocess_run(v->speex_pp, (int16_t*)om->b_rptr)) {
		/* Voice detected: restart the silence timer and re-arm the event. */
		v->last_voice_detection = f->ticker->time;
		v->silence_event_send = 0;
		return;
	}

	if ((v->last_voice_detection + v->silence_duration) <= f->ticker->time) {
		if (!v->silence_event_send) {
			ms_filter_notify_no_arg(f, MS_VOLUME_EVENT_SILENCE_DETECTED);
			ms_message("Silence event sent.");
		}
		v->silence_event_send = 1;
	}
#else
	/* No VAD backend available: nothing to do. */
	(void)f;
	(void)om;
#endif
}
static MS2_INLINE float compute_gain(Volume *v, float energy, float weight) {
float ret = v->static_gain / (1 + (energy * weight));
return ret;
......@@ -383,6 +414,18 @@ static int volume_remove_dc(MSFilter *f, void *arg){
return 0;
}
/**
 * Handler for MS_VOLUME_ENABLE_SILENCE_DETECTION.
 *
 * @param f   the volume filter whose state is updated.
 * @param arg pointer to an int; its value is stored as the enable flag.
 * @return always 0.
 */
static int volume_enable_silence_detection(MSFilter *f, void *arg) {
	const int *enabled = (const int *)arg;
	Volume *vol = (Volume *)f->data;

	vol->silence_detection_enable = *enabled;
	return 0;
}
/**
 * Handler for MS_VOLUME_SET_SILENCE_DURATION_THRESHOLD.
 *
 * @param f   the volume filter whose state is updated.
 * @param arg pointer to an unsigned int; its value is stored as the silence
 *            duration threshold.
 * @return always 0.
 */
static int volume_set_silence_duration_threshold(MSFilter *f, void *arg) {
	const unsigned int *threshold = (const unsigned int *)arg;
	Volume *vol = (Volume *)f->data;

	vol->silence_duration = *threshold;
	return 0;
}
/**
 * Clamp an integer sample to the symmetric 16-bit range [-32767, 32767].
 *
 * @param val sample value to clamp.
 * @return val unchanged when it is within range, otherwise the nearest bound.
 */
static MS2_INLINE int16_t saturate(int val) {
	if (val > 32767) {
		return 32767;
	}
	if (val < -32767) {
		return -32767;
	}
	return (int16_t)val;
}
......@@ -460,22 +503,25 @@ static void volume_preprocess(MSFilter *f){
Volume *v=(Volume*)f->data;
/*process agc by chunks of 10 ms*/
v->nsamples=(int)(0.01*(float)v->sample_rate);
if (v->agc_enabled){
if (v->agc_enabled) {
ms_message("AGC is enabled.");
}
#if defined HAVE_SPEEXDSP && !defined MS_FIXED_POINT
if (v->speex_pp==NULL){
int tmp=1;
v->speex_pp=speex_preprocess_state_init(v->nsamples,v->sample_rate);
if (speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_AGC,&tmp)==-1){
ms_warning("Speex AGC is not available.");
}
tmp=0;
speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_VAD,&tmp);
speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_DENOISE,&tmp);
speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_DEREVERB,&tmp);
if (v->speex_pp==NULL && (v->agc_enabled || v->silence_detection_enable)){
int tmp=1;
v->speex_pp=speex_preprocess_state_init(v->nsamples,v->sample_rate);
if (v->agc_enabled && speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_AGC,&tmp)==-1){
ms_warning("Speex AGC is not available.");
}
#endif
if (v->silence_detection_enable && speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_VAD,&tmp)==-1) {
ms_warning("Speex VAD is not available.");
}
tmp=0;
speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_DENOISE,&tmp);
speex_preprocess_ctl(v->speex_pp,SPEEX_PREPROCESS_SET_DEREVERB,&tmp);
}
#endif
ortp_extremum_reset(&v->min);
ortp_extremum_reset(&v->max);
}
......@@ -490,29 +536,28 @@ static void volume_process(MSFilter *f){
* override this target gain, and order must be well thought out
*/
if (v->agc_enabled || v->peer!=NULL){
mblk_t *om;
size_t nbytes=(size_t)(v->nsamples*2);
ms_bufferizer_put_from_queue(v->buffer,f->inputs[0]);
while(ms_bufferizer_get_avail(v->buffer)>=nbytes){
om=allocb(nbytes,0);
ms_bufferizer_read(v->buffer,om->b_wptr,nbytes);
om->b_wptr+=nbytes;
update_energy(v,(int16_t*)om->b_rptr, v->nsamples, f->ticker->time);
m=allocb(nbytes,0);
ms_bufferizer_read(v->buffer,m->b_wptr,nbytes);
m->b_wptr+=nbytes;
update_energy(v,(int16_t*)m->b_rptr, v->nsamples, f->ticker->time);
target_gain = v->static_gain;
if (v->peer) /* this ptr set = echo limiter enable flag */
target_gain = volume_echo_avoider_process(v, om);
target_gain = volume_echo_avoider_process(v, m);
/* Multiply with gain from echo limiter, not "choose smallest". Why?
* Remote talks, local echo suppress via mic path, but still audible in
* remote speaker. AGC operates fully, too (local speaker close to local mic!);
* having agc gain reduction also contribute to total reduction makes sense.
*/
if (v->agc_enabled) target_gain/= volume_agc_process(v, om);
if (v->noise_gate_enabled)
volume_noise_gate_process(v, v->instant_energy, om);
apply_gain(v, om, target_gain);
ms_queue_put(f->outputs[0],om);
if (v->silence_detection_enable) volume_vad_process(f, m);
if (v->agc_enabled) target_gain/= volume_agc_process(f, m);
if (v->noise_gate_enabled) volume_noise_gate_process(v, v->instant_energy, m);
apply_gain(v, m, target_gain);
ms_queue_put(f->outputs[0],m);
}
}else{
/*light processing: no agc. Work in place in the input buffer*/
......@@ -520,8 +565,8 @@ static void volume_process(MSFilter *f){
update_energy(v,(int16_t*)m->b_rptr, (int)((m->b_wptr - m->b_rptr) / 2), f->ticker->time);
target_gain = v->static_gain;
if (v->noise_gate_enabled)
volume_noise_gate_process(v, v->instant_energy, m);
if (v->silence_detection_enable) volume_vad_process(f, m);
if (v->noise_gate_enabled) volume_noise_gate_process(v, v->instant_energy, m);
apply_gain(v, m, target_gain);
ms_queue_put(f->outputs[0],m);
}
......@@ -549,6 +594,8 @@ static MSFilterMethod methods[]={
{ MS_VOLUME_REMOVE_DC, volume_remove_dc },
{ MS_VOLUME_GET_MIN , volume_get_min },
{ MS_VOLUME_GET_MAX , volume_get_max },
{ MS_VOLUME_ENABLE_SILENCE_DETECTION , volume_enable_silence_detection},
{ MS_VOLUME_SET_SILENCE_DURATION_THRESHOLD , volume_set_silence_duration_threshold},
{ 0 , NULL }
};
......
......@@ -48,6 +48,7 @@ set(SOUND_FILES
sounds/sintel_trailer_opus_h264.mkv
sounds/sintel_trailer_opus_vp8.mkv
sounds/sintel_trailer_pcmu_h264.mkv
sounds/test_silence_voice.wav
)
set(SCENARIO_FILES
......
......@@ -24,6 +24,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#include "mediastreamer2/msfilerec.h"
#include "mediastreamer2/msrtp.h"
#include "mediastreamer2/mstonedetector.h"
#include "mediastreamer2/msvolume.h"
#include "mediastreamer2_tester.h"
#include "mediastreamer2_tester_private.h"
#include "private.h"
......@@ -49,6 +50,89 @@ static void tone_detected_cb(void *data, MSFilter *f, unsigned int event_id, MST
ms_tester_tone_detected = TRUE;
}
#if 0 //Disable this test until we find a good VAD implementation
#ifdef HAVE_SPEEXDSP
#define TEST_SILENCE_VOICE_FILE_NAME "sounds/test_silence_voice.wav"
/* User data handed to silence_detected_cb(). */
typedef struct struct_silence_callback_data {
/* Incremented each time MS_VOLUME_EVENT_SILENCE_DETECTED is received, i.e. despite its name it counts silence events. */
int voice_detected_number;
} silence_callback_data;
/**
 * Filter notification callback counting silence events.
 *
 * @param data     pointer to a silence_callback_data whose counter is bumped.
 * @param f        notifying filter (unused).
 * @param event_id event identifier; only MS_VOLUME_EVENT_SILENCE_DETECTED is handled.
 * @param arg      event argument (unused).
 */
static void silence_detected_cb(void *data, MSFilter *f, unsigned int event_id, void *arg) {
	silence_callback_data *counters;

	if (event_id != MS_VOLUME_EVENT_SILENCE_DETECTED) {
		return;
	}
	counters = (silence_callback_data *)data;
	counters->voice_detected_number += 1;
}
/* User data handed to player_cb(). */
typedef struct struct_player_callback_data {
/* Set to TRUE by player_cb() when MS_FILE_PLAYER_EOF is received. */
int end_of_file;
} player_callback_data;
/**
 * Filter notification callback recording that the file player reached the end
 * of its file.
 *
 * @param data     pointer to a player_callback_data whose flag is raised.
 * @param f        notifying filter (unused).
 * @param event_id event identifier; only MS_FILE_PLAYER_EOF is handled.
 * @param arg      event argument (unused).
 */
static void player_cb(void *data, MSFilter *f, unsigned int event_id, void *arg) {
	player_callback_data *state;

	if (event_id != MS_FILE_PLAYER_EOF) {
		return;
	}
	state = (player_callback_data *)data;
	state->end_of_file = TRUE;
}
/*
 * Plays TEST_SILENCE_VOICE_FILE_NAME through an MS_VOLUME_ID filter with
 * silence detection enabled and a threshold of 1000 (milliseconds, per the
 * method's documentation), then checks that the player reached end of file and
 * that exactly 2 silence events were notified.
 * Graph: file player -> volume filter -> void sink.
 */
static void silence_detection(void) {
MSConnectionHelper h;
silence_callback_data silence_data;
player_callback_data player_data;
MSFilter *voice_detector;
unsigned int filter_mask = FILTER_MASK_FILEPLAY | FILTER_MASK_VOIDSINK;
char* recorded_file = bc_tester_res(TEST_SILENCE_VOICE_FILE_NAME);
unsigned int enable_silence = 1;
unsigned int duration_threshold = 1000;
// Start from a clean state: no silence counted, end of file not reached.
silence_data.voice_detected_number = 0;
player_data.end_of_file = FALSE;
ms_factory_reset_statistics(msFactory);
ms_tester_create_ticker();
ms_tester_create_filters(filter_mask, msFactory);
voice_detector = ms_factory_create_filter(msFactory, MS_VOLUME_ID);
// Register the callbacks before starting playback.
ms_filter_add_notify_callback(voice_detector, silence_detected_cb, &silence_data, TRUE);
ms_filter_add_notify_callback(ms_tester_fileplay, player_cb, &player_data, TRUE);
ms_filter_call_method(ms_tester_fileplay, MS_FILE_PLAYER_OPEN, recorded_file);
ms_filter_call_method_noarg(ms_tester_fileplay, MS_FILE_PLAYER_START);
// Configure silence detection before the graph is attached to the ticker.
ms_filter_call_method(voice_detector, MS_VOLUME_ENABLE_SILENCE_DETECTION, (void*)&enable_silence);
ms_filter_call_method(voice_detector, MS_VOLUME_SET_SILENCE_DURATION_THRESHOLD, (void*)&duration_threshold);
ms_connection_helper_start(&h);
ms_connection_helper_link(&h, ms_tester_fileplay, -1, 0);
ms_connection_helper_link(&h, voice_detector, 0, 0);
ms_connection_helper_link(&h, ms_tester_voidsink, 0, -1);
ms_ticker_attach(ms_tester_ticker, ms_tester_fileplay);
// Wait for the end-of-file flag; 26000 is presumably a timeout in ms - TODO confirm against wait_for_until().
BC_ASSERT_TRUE(wait_for_until(NULL, NULL, &player_data.end_of_file, TRUE, 26000));
// TODO: choose a better sample file and check how many silences should be detected
BC_ASSERT_EQUAL(silence_data.voice_detected_number, 2, int, "%d");
// Tear down: close the player, detach from the ticker, then unlink in the same order as linked.
ms_filter_call_method_noarg(ms_tester_fileplay, MS_FILE_PLAYER_CLOSE);
ms_ticker_detach(ms_tester_ticker, ms_tester_fileplay);
ms_connection_helper_start(&h);
ms_connection_helper_unlink(&h, ms_tester_fileplay, -1, 0);
ms_connection_helper_unlink(&h, voice_detector, 0, 0);
ms_connection_helper_unlink(&h, ms_tester_voidsink, 0, -1);
ms_factory_log_statistics(msFactory);
if (voice_detector) ms_filter_destroy(voice_detector);
ms_tester_destroy_filters(filter_mask);
ms_tester_destroy_ticker();
ms_free(recorded_file);
}
#endif
#endif //if 0
static void dtmfgen_tonedet(void) {
MSConnectionHelper h;
unsigned int filter_mask = FILTER_MASK_VOIDSOURCE | FILTER_MASK_DTMFGEN | FILTER_MASK_TONEDET | FILTER_MASK_VOIDSINK;
......@@ -400,11 +484,16 @@ static void dtmfgen_filerec_fileplay_tonedet(void) {
ms_tester_destroy_filters(filter_mask);
ms_tester_destroy_ticker();
unlink(recorded_file);
free(recorded_file);
free(recorded_file);
}
test_t basic_audio_tests[] = {
#if 0 //Disable this test until we find a good VAD implementation
#ifdef HAVE_SPEEXDSP
TEST_NO_TAG("silence detection", silence_detection),
#endif
#endif
TEST_NO_TAG("dtmfgen-tonedet", dtmfgen_tonedet),
TEST_NO_TAG("dtmfgen-enc-dec-tonedet-bv16", dtmfgen_enc_dec_tonedet_bv16),
TEST_NO_TAG("dtmfgen-enc-dec-tonedet-pcmu", dtmfgen_enc_dec_tonedet_pcmu),
......
......@@ -56,7 +56,7 @@ static void _decode_qr_code(const char *_image_path, bool_t want_decode, MSRect
char* image_path, *image_res_path;
MSFilter *nowebcam_qrcode = NULL;
MSFilter *zxing_qrcode = NULL;
MSFilter *void_sing = NULL;
MSFilter *void_sink = NULL;
qrcode_callback_data qrcode_cb_data;
MSFactory* _factory = NULL;
......@@ -77,7 +77,7 @@ static void _decode_qr_code(const char *_image_path, bool_t want_decode, MSRect
nowebcam_qrcode = ms_web_cam_create_reader(camera);
ms_filter_notify(nowebcam_qrcode, MS_STATIC_IMAGE_SET_IMAGE, image_res_path);
zxing_qrcode = ms_factory_create_filter(_factory, MS_QRCODE_READER_ID);
void_sing = ms_factory_create_filter(_factory, MS_VOID_SINK_ID);
void_sink = ms_factory_create_filter(_factory, MS_VOID_SINK_ID);
ms_filter_add_notify_callback(zxing_qrcode, (MSFilterNotifyFunc)qrcode_found_cb, &qrcode_cb_data, TRUE);
if (capture_rect) {
MSVideoSize size;
......@@ -90,7 +90,7 @@ static void _decode_qr_code(const char *_image_path, bool_t want_decode, MSRect
ms_connection_helper_start(&h);
ms_connection_helper_link(&h, nowebcam_qrcode, -1, 0);
ms_connection_helper_link(&h, zxing_qrcode, 0, 0);
ms_connection_helper_link(&h, void_sing, 0, -1);
ms_connection_helper_link(&h, void_sink, 0, -1);
ms_ticker_attach(ms_tester_ticker, nowebcam_qrcode);
while(number_of_run-- > 0) {
......@@ -120,14 +120,14 @@ static void _decode_qr_code(const char *_image_path, bool_t want_decode, MSRect
ms_connection_helper_start(&h);
ms_connection_helper_unlink(&h, nowebcam_qrcode, -1, 0);
ms_connection_helper_unlink(&h, zxing_qrcode, 0, 0);
ms_connection_helper_unlink(&h, void_sing, 0, -1);
ms_connection_helper_unlink(&h, void_sink, 0, -1);
ms_factory_log_statistics(_factory);
if (image_path) ms_free(image_path);
if (image_res_path) ms_free(image_res_path);
if (nowebcam_qrcode) ms_filter_destroy(nowebcam_qrcode);
if (zxing_qrcode) ms_filter_destroy(zxing_qrcode);
if (void_sing) ms_filter_destroy(void_sing);
if (void_sink) ms_filter_destroy(void_sink);
if (_factory) ms_factory_destroy(_factory);
ms_tester_destroy_ticker();
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment