diff --git a/AudioExample/AudioExample.js b/AudioExample/AudioExample.js index a65e04d6..a5419e87 100644 --- a/AudioExample/AudioExample.js +++ b/AudioExample/AudioExample.js @@ -20,18 +20,23 @@ class AudioExample extends Component { recording: false, stoppedRecording: false, finished: false, - audioPath: AudioUtils.DocumentDirectoryPath + '/test.aac', + audioPath: AudioUtils.DocumentDirectoryPath + '/test.wav', hasPermission: undefined, }; prepareRecordingPath(audioPath){ - AudioRecorder.prepareRecordingAtPath(audioPath, { - SampleRate: 22050, - Channels: 1, - AudioQuality: "Low", - AudioEncoding: "aac", - AudioEncodingBitRate: 32000 - }); + AudioRecorder.prepareStreamingAtPath(this.state.audioPath, 1600, { + SampleRate: 22050, + Channels: 1, + AudioSource: 'MIC', + // Following is not supported + // AudioQuality: "Low", + // AudioEncoding: "aac", + // AudioEncodingBitRate: 32000, + }, { + Sensitivity: 0, + Timeout: 7000, + }); } componentDidMount() { @@ -41,7 +46,8 @@ class AudioExample extends Component { if (!hasPermission) return; this.prepareRecordingPath(this.state.audioPath); - + console.log(this.state.audioPath); + console.log(AudioRecorder); AudioRecorder.onProgress = (data) => { this.setState({currentTime: Math.floor(data.currentTime)}); }; @@ -52,6 +58,14 @@ class AudioExample extends Component { this._finishRecording(data.status === "OK", data.audioFileURL); } }; + + AudioRecorder.onDataReceived = (data) => { + // console.log(data); + } + + AudioRecorder.onVadReceived = (vadResult) => { + console.log(vadResult); + } }); } @@ -93,7 +107,7 @@ class AudioExample extends Component { this.setState({stoppedRecording: true, recording: false}); try { - const filePath = await AudioRecorder.pauseRecording(); + const filePath = await AudioRecorder.pauseStreaming(); // Pause is currently equivalent to stop on Android. 
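+      // (pauseStreaming delegates to stopStreaming in the Android module, so the promise
+      // resolves with the finished file's path and a pause is treated as a completed
+      // recording below.)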
if (Platform.OS === 'android') { @@ -113,7 +127,7 @@ class AudioExample extends Component { this.setState({stoppedRecording: true, recording: false}); try { - const filePath = await AudioRecorder.stopRecording(); + const filePath = await AudioRecorder.stopStreaming(); if (Platform.OS === 'android') { this._finishRecording(true, filePath); @@ -168,7 +182,7 @@ class AudioExample extends Component { this.setState({recording: true}); try { - const filePath = await AudioRecorder.startRecording(); + const filePath = await AudioRecorder.startStreaming(); } catch (error) { console.error(error); } diff --git a/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj b/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj index 57c314fb..9ae1682b 100644 --- a/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj +++ b/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj @@ -528,8 +528,12 @@ TargetAttributes = { 00E356ED1AD99517003FC87E = { CreatedOnToolsVersion = 6.2; + DevelopmentTeam = SD72YP83U5; TestTargetID = 13B07F861A680F5B00A75B9A; }; + 13B07F861A680F5B00A75B9A = { + DevelopmentTeam = SD72YP83U5; + }; }; }; buildConfigurationList = 83CBB9FA1A601CBA00E9B192 /* Build configuration list for PBXProject "AudioExample" */; @@ -878,6 +882,7 @@ isa = XCBuildConfiguration; buildSettings = { BUNDLE_LOADER = "$(TEST_HOST)"; + DEVELOPMENT_TEAM = SD72YP83U5; GCC_PREPROCESSOR_DEFINITIONS = ( "DEBUG=1", "$(inherited)", @@ -900,6 +905,7 @@ buildSettings = { BUNDLE_LOADER = "$(TEST_HOST)"; COPY_PHASE_STRIP = NO; + DEVELOPMENT_TEAM = SD72YP83U5; INFOPLIST_FILE = AudioExampleTests/Info.plist; IPHONEOS_DEPLOYMENT_TARGET = 8.0; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks"; @@ -919,6 +925,7 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CURRENT_PROJECT_VERSION = 1; DEAD_CODE_STRIPPING = NO; + DEVELOPMENT_TEAM = SD72YP83U5; INFOPLIST_FILE = AudioExample/Info.plist; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; OTHER_LDFLAGS = ( @@ -936,6 +943,7 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CURRENT_PROJECT_VERSION = 1; + DEVELOPMENT_TEAM = SD72YP83U5; INFOPLIST_FILE = AudioExample/Info.plist; LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks"; OTHER_LDFLAGS = ( diff --git a/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json b/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json index 118c98f7..b8236c65 100644 --- a/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json +++ b/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json @@ -1,5 +1,15 @@ { "images" : [ + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "2x" + }, + { + "idiom" : "iphone", + "size" : "20x20", + "scale" : "3x" + }, { "idiom" : "iphone", "size" : "29x29", diff --git a/AudioExample/iOS/AudioExample/Info.plist b/AudioExample/iOS/AudioExample/Info.plist index 2fb6a11c..4728718c 100644 --- a/AudioExample/iOS/AudioExample/Info.plist +++ b/AudioExample/iOS/AudioExample/Info.plist @@ -38,6 +38,8 @@ NSLocationWhenInUseUsageDescription + NSMicrophoneUsageDescription + NSAppTransportSecurity diff --git a/android/build.gradle b/android/build.gradle index 22023145..56d21662 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -9,6 +9,10 @@ android { targetSdkVersion 23 versionCode 1 versionName "1.0" + ndk { + moduleName "witvad" + ldLibs "log" + } } buildTypes { release { @@ -21,4 +25,5 @@ dependencies { compile 
fileTree(include: ['*.jar'], dir: 'libs') compile 'com.android.support:appcompat-v7:23.1.0' compile 'com.facebook.react:react-native:+' + compile 'com.github.wendykierp:JTransforms:3.0' } diff --git a/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java b/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java index 33c65d0f..5b266a75 100644 --- a/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java +++ b/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java @@ -1,7 +1,6 @@ package com.rnim.rn.audio; import android.Manifest; -import android.content.Context; import com.facebook.react.bridge.ReactApplicationContext; import com.facebook.react.bridge.ReactContextBaseJavaModule; @@ -10,25 +9,24 @@ import com.facebook.react.bridge.Arguments; import com.facebook.react.bridge.Promise; import com.facebook.react.bridge.ReadableMap; +import com.facebook.react.bridge.WritableArray; import com.facebook.react.bridge.WritableMap; import java.io.File; -import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.Timer; import java.util.TimerTask; import android.content.pm.PackageManager; +import android.media.AudioFormat; +import android.os.AsyncTask; import android.os.Environment; import android.media.MediaRecorder; -import android.media.AudioManager; -import android.support.v4.app.ActivityCompat; import android.support.v4.content.ContextCompat; import android.util.Log; -import com.facebook.react.modules.core.DeviceEventManagerModule; -import java.io.FileInputStream; +import com.facebook.react.modules.core.DeviceEventManagerModule; class AudioRecorderManager extends ReactContextBaseJavaModule { @@ -42,17 +40,19 @@ class AudioRecorderManager extends ReactContextBaseJavaModule { private static final String MusicDirectoryPath = "MusicDirectoryPath"; private static final String DownloadsDirectoryPath = "DownloadsDirectoryPath"; - private Context context; - private MediaRecorder recorder; private String currentOutputFile; private boolean isRecording = false; private Timer timer; private int recorderSecondsElapsed; + // For AudioRecord Class + private RecordWaveTask recordTask = null; public AudioRecorderManager(ReactApplicationContext reactContext) { super(reactContext); - this.context = reactContext; + if (recordTask == null) { + recordTask = new RecordWaveTask(); + } } @Override @@ -81,57 +81,24 @@ public void checkAuthorizationStatus(Promise promise) { promise.resolve(permissionGranted); } - @ReactMethod - public void prepareRecordingAtPath(String recordingPath, ReadableMap recordingSettings, Promise promise) { - if (isRecording){ - logAndRejectPromise(promise, "INVALID_STATE", "Please call stopRecording before starting recording"); - } - - recorder = new MediaRecorder(); - try { - recorder.setAudioSource(MediaRecorder.AudioSource.MIC); - int outputFormat = getOutputFormatFromString(recordingSettings.getString("OutputFormat")); - recorder.setOutputFormat(outputFormat); - int audioEncoder = getAudioEncoderFromString(recordingSettings.getString("AudioEncoding")); - recorder.setAudioEncoder(audioEncoder); - recorder.setAudioSamplingRate(recordingSettings.getInt("SampleRate")); - recorder.setAudioChannels(recordingSettings.getInt("Channels")); - recorder.setAudioEncodingBitRate(recordingSettings.getInt("AudioEncodingBitRate")); - recorder.setOutputFile(recordingPath); - } - catch(final Exception e) { - logAndRejectPromise(promise, "COULDNT_CONFIGURE_MEDIA_RECORDER" , "Make sure you've added RECORD_AUDIO permission to your 
AndroidManifest.xml file "+e.getMessage()); - return; - } - - currentOutputFile = recordingPath; - try { - recorder.prepare(); - promise.resolve(currentOutputFile); - } catch (final Exception e) { - logAndRejectPromise(promise, "COULDNT_PREPARE_RECORDING_AT_PATH "+recordingPath, e.getMessage()); - } - - } - private int getAudioEncoderFromString(String audioEncoder) { - switch (audioEncoder) { - case "aac": - return MediaRecorder.AudioEncoder.AAC; - case "aac_eld": - return MediaRecorder.AudioEncoder.AAC_ELD; - case "amr_nb": - return MediaRecorder.AudioEncoder.AMR_NB; - case "amr_wb": - return MediaRecorder.AudioEncoder.AMR_WB; - case "he_aac": - return MediaRecorder.AudioEncoder.HE_AAC; - case "vorbis": - return MediaRecorder.AudioEncoder.VORBIS; - default: - Log.d("INVALID_AUDIO_ENCODER", "USING MediaRecorder.AudioEncoder.DEFAULT instead of "+audioEncoder+": "+MediaRecorder.AudioEncoder.DEFAULT); - return MediaRecorder.AudioEncoder.DEFAULT; - } + switch (audioEncoder) { + case "aac": + return MediaRecorder.AudioEncoder.AAC; + case "aac_eld": + return MediaRecorder.AudioEncoder.AAC_ELD; + case "amr_nb": + return MediaRecorder.AudioEncoder.AMR_NB; + case "amr_wb": + return MediaRecorder.AudioEncoder.AMR_WB; + case "he_aac": + return MediaRecorder.AudioEncoder.HE_AAC; + case "vorbis": + return MediaRecorder.AudioEncoder.VORBIS; + default: + Log.d("INVALID_AUDIO_ENCODER", "USING MediaRecorder.AudioEncoder.DEFAULT instead of "+audioEncoder+": "+MediaRecorder.AudioEncoder.DEFAULT); + return MediaRecorder.AudioEncoder.DEFAULT; + } } private int getOutputFormatFromString(String outputFormat) { @@ -156,52 +123,144 @@ private int getOutputFormatFromString(String outputFormat) { } @ReactMethod - public void startRecording(Promise promise){ - if (recorder == null){ - logAndRejectPromise(promise, "RECORDING_NOT_PREPARED", "Please call prepareRecordingAtPath before starting recording"); - return; + public void prepareStreamingAtPath(String recordingPath, int bufferSize, ReadableMap recordingSettings, ReadableMap vadSettings, Promise promise) { + + try { + File wavFile = new File(recordingPath); + recordTask = new RecordWaveTask(); + + if (recordingSettings.hasKey("AudioSource")) { + switch(recordingSettings.getString("AudioSource")) { + case "DEFAULT": + recordTask.setAudioSource(MediaRecorder.AudioSource.DEFAULT); + break; + case "MIC": + recordTask.setAudioSource(MediaRecorder.AudioSource.MIC); + break; + case "VOICE_RECOGNITION": + recordTask.setAudioSource(MediaRecorder.AudioSource.VOICE_RECOGNITION); + break; + default: + recordTask.setAudioSource(MediaRecorder.AudioSource.DEFAULT); + break; + } + } + + if (recordingSettings.hasKey("SampleRate")) { + recordTask.setSampleRate(recordingSettings.getInt("SampleRate")); + } + + if (recordingSettings.hasKey("Channels")) { + int channels = recordingSettings.getInt("Channels"); + int channelMask = AudioFormat.CHANNEL_IN_STEREO; + if (channels == 1) { + channelMask = AudioFormat.CHANNEL_IN_MONO; + } + recordTask.setChannelMask(channelMask); + } + + if (vadSettings.hasKey("Sensitivity")) { + int vadSensitivity = vadSettings.getInt("Sensitivity"); + recordTask.setVadSensitivity(vadSensitivity); + } + + if (vadSettings.hasKey("Timeout")) { + int vadTimeout = vadSettings.getInt("Timeout"); + recordTask.setVadTimeout(vadTimeout); + } + + recordTask.setBufferSize(bufferSize); + + recordTask.setOutputFile(wavFile); + recordTask.setStreamListener(new RecordWaveTask.OnStreamListener() { + + @Override + public void onDataReceived(short[] buffer) { + 
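+            // The React Native bridge has no short[] argument type, so each 16-bit
+            // PCM sample is widened to an int and pushed onto a WritableArray before
+            // being emitted to JS as the "dataReceived" event via sendEvent below.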
Log.d("onDataReceived", buffer.length + ""); + WritableArray body = Arguments.createArray(); + for (short value: buffer) { + body.pushInt((int) value); + } + sendEvent("dataReceived", body); + } + }); + + recordTask.setVadListener(new RecordWaveTask.OnVadListener() { + + @Override + public void onVadReceived(int vadResult) { + Log.d("onVadReceived", vadResult + ""); + // WritableMap body = Arguments.createMap(); + // body.putInt("vadResult", vadResult); + sendEvent("vadReceived", vadResult); + } + }); + + // int outputFormat = getOutputFormatFromString(recordingSettings.getString("OutputFormat")); + // recorder.setOutputFormat(outputFormat); + // int audioEncoder = getAudioEncoderFromString(recordingSettings.getString("AudioEncoding")); + // recorder.setAudioEncoder(audioEncoder); + // recorder.setAudioEncodingBitRate(recordingSettings.getInt("AudioEncodingBitRate")); } - if (isRecording){ - logAndRejectPromise(promise, "INVALID_STATE", "Please call stopRecording before starting recording"); + catch(final Exception e) { + logAndRejectPromise(promise, "COULDNT_CONFIGURE_MEDIA_RECORDER" , "Make sure you've added RECORD_AUDIO permission to your AndroidManifest.xml file "+e.getMessage()); return; } - recorder.start(); - isRecording = true; - startTimer(); - promise.resolve(currentOutputFile); + + currentOutputFile = recordingPath; } @ReactMethod - public void stopRecording(Promise promise){ - if (!isRecording){ - logAndRejectPromise(promise, "INVALID_STATE", "Please call startRecording before stopping recording"); + public void startStreaming(Promise promise){ + if (recordTask == null){ + logAndRejectPromise(promise, "STREAMING_NOT_PREPARED", "Please call prepareStreamingAtPath before starting streaming"); return; } - - stopTimer(); - isRecording = false; - - try { - recorder.stop(); - recorder.release(); - } - catch (final RuntimeException e) { - // https://developer.android.com/reference/android/media/MediaRecorder.html#stop() - logAndRejectPromise(promise, "RUNTIME_EXCEPTION", "No valid audio data received. 
You may be using a device that can't record audio."); - return; - } - finally { - recorder = null; + switch (recordTask.getStatus()) { + case RUNNING: + logAndRejectPromise(promise, "INVALID_STATE", "Please call stopStreaming before starting streaming"); + return; + case FINISHED: + logAndRejectPromise(promise, "STREAMING_NOT_PREPARED", "Please call prepareStreamingAtPath before starting streaming"); + break; + case PENDING: + // No Action } + startTimer(); + recordTask.execute(); + + isRecording = true; promise.resolve(currentOutputFile); - sendEvent("recordingFinished", null); } @ReactMethod - public void pauseRecording(Promise promise){ + public void stopStreaming(final Promise promise){ + Log.d("RecordWaveTask", "stopStreaming"); + if (recordTask != null && !recordTask.isCancelled() && recordTask.getStatus() == AsyncTask.Status.RUNNING) { + Log.d("RecordWaveTask", "stopStreaming2"); + isRecording = false; + recordTask.setCancelCompleteListener(new RecordWaveTask.OnCancelCompleteListener() { + @Override + public void onCancelCompleted() { + Log.d("RecordWaveTask", "onCancelCompleted"); + recordTask = null; + promise.resolve(currentOutputFile); + sendEvent("recordingFinished", null); + } + }); + recordTask.cancel(false); + stopTimer(); + } else { + Log.d("RecordWaveTask", "Task not running."); + logAndRejectPromise(promise, "INVALID_STATE", "Please call startStreaming before stopping streaming"); + } + } + + @ReactMethod + public void pauseStreaming(Promise promise){ // Added this function to have the same api for android and iOS, stops recording now - stopRecording(promise); + stopStreaming(promise); } private void startTimer(){ @@ -226,7 +285,7 @@ private void stopTimer(){ timer = null; } } - + private void sendEvent(String eventName, Object params) { getReactApplicationContext() .getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class) diff --git a/android/src/main/java/com/rnim/rn/audio/RecordWaveTask.java b/android/src/main/java/com/rnim/rn/audio/RecordWaveTask.java new file mode 100644 index 00000000..c961481e --- /dev/null +++ b/android/src/main/java/com/rnim/rn/audio/RecordWaveTask.java @@ -0,0 +1,384 @@ +package com.rnim.rn.audio; + +import android.media.AudioFormat; +import android.media.AudioRecord; +import android.media.MediaRecorder; +import android.os.AsyncTask; +import android.os.SystemClock; +import android.util.Log; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.RandomAccessFile; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; + +import java.util.Arrays; +import org.jtransforms.fft.FloatFFT_1D; + +/** + * Created by KDH on 2017. 5. 15.. 
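+ *  Records 16-bit PCM from AudioRecord into a WAV file on a background AsyncTask,
+ *  hands each captured buffer to the registered stream listener, and runs the Wit
+ *  VAD over fixed-size frames of the captured audio.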
+ */
+
+public class RecordWaveTask extends AsyncTask<File, Void, Object[]> {
+
+    // Default values
+    private int AUDIO_SOURCE = MediaRecorder.AudioSource.DEFAULT;
+    private int SAMPLE_RATE = 44100; // Hz
+    private int ENCODING = AudioFormat.ENCODING_PCM_16BIT;
+    private int CHANNEL_MASK = AudioFormat.CHANNEL_IN_MONO;
+    private int BUFFER_SIZE_IN_FRAME = 8192;
+    private int vadSensitivity = 0;
+    private int vadTimeout = 7000;
+    // int BUFFER_SIZE = 2 * AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_MASK, ENCODING);
+
+    private File outputFile;
+
+    static {
+        System.loadLibrary("witvad");
+    }
+
+    public native int VadInit(int sampleRate, int vadSensitivity, int vadTimeout);
+    public native int VadStillTalking(short[] samples, float[] fft_mags);
+    public native int GetVadSamplesPerFrame();
+    public native void VadClean();
+
+    public RecordWaveTask() {}
+    public void setAudioSource(int audioSource) { this.AUDIO_SOURCE = audioSource; }
+
+    public void setSampleRate(int sampleRate) { this.SAMPLE_RATE = sampleRate; }
+
+    public void setEncoding(int encoding) { this.ENCODING = encoding; }
+
+    public void setChannelMask(int channelMask) { this.CHANNEL_MASK = channelMask; }
+
+    public void setOutputFile(File file) { this.outputFile = file; }
+
+    public void setBufferSize(int bufferSizeInFrame) { this.BUFFER_SIZE_IN_FRAME = bufferSizeInFrame; }
+
+    public void setVadSensitivity(int vadSensitivity) { this.vadSensitivity = vadSensitivity; }
+
+    public void setVadTimeout(int vadTimeout) { this.vadTimeout = vadTimeout; }
+
+    // This interface defines the type of messages the task communicates to its owner
+    public interface OnCancelCompleteListener {
+        public void onCancelCompleted();
+    }
+    private OnCancelCompleteListener cancelCompleteListener = null;
+
+    public void setCancelCompleteListener(OnCancelCompleteListener listener) {
+        this.cancelCompleteListener = listener;
+    }
+
+    public interface OnStreamListener {
+        public void onDataReceived(short[] buffer);
+    }
+    private OnStreamListener streamListener = null;
+
+    public void setStreamListener(OnStreamListener listener) {
+        this.streamListener = listener;
+    }
+
+    public interface OnVadListener {
+        public void onVadReceived(int vadResult);
+    }
+    private OnVadListener vadListener = null;
+
+    public void setVadListener(OnVadListener listener) {
+        this.vadListener = listener;
+    }
+
+    /**
+     * Opens up the given file, writes the header, and keeps filling it with raw PCM bytes from
+     * AudioRecord until it reaches 4GB or is stopped by the user. It then goes back and updates
+     * the WAV header to include the proper final chunk sizes.
+     *
+     * @return Either an Exception (error) or two longs, the filesize, elapsed time in ms (success)
+     */
+    @Override
+    protected Object[] doInBackground(File...
unused) { + AudioRecord audioRecord = null; + FileOutputStream wavOut = null; + + long startTime = 0; + long endTime = 0; + + try { + // Open our two resources + int bufferSizeInBytes = BUFFER_SIZE_IN_FRAME * 2; + audioRecord = new AudioRecord(AUDIO_SOURCE, SAMPLE_RATE, CHANNEL_MASK, ENCODING, bufferSizeInBytes); + wavOut = new FileOutputStream(this.outputFile); + + // Write out the wav file header + writeWavHeader(wavOut, CHANNEL_MASK, SAMPLE_RATE, ENCODING); + + // Avoiding loop allocations + short[] buffer = new short[BUFFER_SIZE_IN_FRAME]; + boolean run = true; + int read; + long total = 0; + int vadResult; + + VadInit(SAMPLE_RATE, vadSensitivity, vadTimeout); + + FloatFFT_1D fft = new FloatFFT_1D(GetVadSamplesPerFrame()); + float[] fft_mags = new float[GetVadSamplesPerFrame()/2]; + float[] fft_modules = new float[GetVadSamplesPerFrame()]; + short[] samples; + + // Let's go + startTime = SystemClock.elapsedRealtime(); + audioRecord.startRecording(); + while (run && !isCancelled()) { + read = audioRecord.read(buffer, 0, buffer.length); // Count for 16 bit PCM + + int samplesAnalyzed = 0; + while(samplesAnalyzed + GetVadSamplesPerFrame() < read){ + samples = Arrays.copyOfRange(buffer, samplesAnalyzed, samplesAnalyzed +GetVadSamplesPerFrame()); + for(int i=0; i 4 GB due to the use of 32 bit unsigned integers. + if (total + read > 4294967295L) { + // Write as many bytes as we can before hitting the max size + short[] tmpBuffer = new short[BUFFER_SIZE_IN_FRAME]; + for (int i = 0; i < read && total <= 4294967295L; i++, total+=2) { + ByteBuffer byteBuffer = ByteBuffer.allocate(2); + byteBuffer.putShort(buffer[i]); + wavOut.write(byteBuffer.array()); + tmpBuffer[i] = buffer[i]; + } + if (this.streamListener != null) { + this.streamListener.onDataReceived(tmpBuffer); + } + run = false; + } else if (read >= 0) { + // Short array to byte array + ByteBuffer byteBuffer = ByteBuffer.allocate(buffer.length * 2); + byteBuffer.order(ByteOrder.LITTLE_ENDIAN); + byteBuffer.asShortBuffer().put(buffer); + byte[] bytes = byteBuffer.array(); + + wavOut.write(bytes, 0, read * 2); + + total += (read * 2); // 2 Byte = Short + if (this.streamListener != null) { + Log.d("onDataReceived", "RecordWaveTask - " + read + ""); + this.streamListener.onDataReceived(buffer.clone()); + } + } + } + } catch (IOException ex) { + return new Object[]{ex}; + } finally { + Log.d("RecordWaveTask", "Finally"); + if (audioRecord != null) { + try { + if (audioRecord.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING) { + VadClean(); + audioRecord.stop(); + Log.d("RecordWaveTask", "audioRecord.stop()"); + endTime = SystemClock.elapsedRealtime(); + } + } catch (IllegalStateException ex) { + // + } + if (audioRecord.getState() == AudioRecord.STATE_INITIALIZED) { + audioRecord.release(); + } + } + if (wavOut != null) { + try { + wavOut.close(); + Log.d("RecordWaveTask", "wavOut.close()"); + } catch (IOException ex) { + Log.d("RecordWaveTask", ex.getMessage()); + } + } + } + + try { + // This is not put in the try/catch/finally above since it needs to run + // after we close the FileOutputStream + this.updateWavHeader(this.outputFile); + } catch (IOException ex) { + Log.d("RecordWaveTask", ex.getMessage()); + return new Object[] { ex }; + } + + Log.d("RecordWaveTask", (endTime - startTime) + " sec" ); + Log.d("RecordWaveTask", this.outputFile.length() + " byte" ); + + return new Object[] { this.outputFile.length(), endTime - startTime }; + } + + /** + * Writes the proper 44-byte RIFF/WAVE header to/for the given stream + * Two size 
fields are left empty/null since we do not yet know the final stream size + * + * @param out The stream to write the header to + * @param channelMask An AudioFormat.CHANNEL_* mask + * @param sampleRate The sample rate in hertz + * @param encoding An AudioFormat.ENCODING_PCM_* value + * @throws IOException + */ + private static void writeWavHeader(OutputStream out, int channelMask, int sampleRate, int encoding) throws IOException { + short channels; + switch (channelMask) { + case AudioFormat.CHANNEL_IN_MONO: + channels = 1; + break; + case AudioFormat.CHANNEL_IN_STEREO: + channels = 2; + break; + default: + throw new IllegalArgumentException("Unacceptable channel mask"); + } + + short bitDepth; + switch (encoding) { + case AudioFormat.ENCODING_PCM_8BIT: + bitDepth = 8; + break; + case AudioFormat.ENCODING_PCM_16BIT: + bitDepth = 16; + break; + case AudioFormat.ENCODING_PCM_FLOAT: + bitDepth = 32; + break; + default: + throw new IllegalArgumentException("Unacceptable encoding"); + } + + writeWavHeader(out, channels, sampleRate, bitDepth); + } + + /** + * Writes the proper 44-byte RIFF/WAVE header to/for the given stream + * Two size fields are left empty/null since we do not yet know the final stream size + * + * @param out The stream to write the header to + * @param channels The number of channels + * @param sampleRate The sample rate in hertz + * @param bitDepth The bit depth + * @throws IOException + */ + private static void writeWavHeader(OutputStream out, short channels, int sampleRate, short bitDepth) throws IOException { + // Convert the multi-byte integers to raw bytes in little endian format as required by the spec + byte[] littleBytes = ByteBuffer + .allocate(14) + .order(ByteOrder.LITTLE_ENDIAN) + .putShort(channels) + .putInt(sampleRate) + .putInt(sampleRate * channels * (bitDepth / 8)) + .putShort((short) (channels * (bitDepth / 8))) + .putShort(bitDepth) + .array(); + + // Not necessarily the best, but it's very easy to visualize this way + out.write(new byte[]{ + // RIFF header + 'R', 'I', 'F', 'F', // ChunkID + 0, 0, 0, 0, // ChunkSize (must be updated later) + 'W', 'A', 'V', 'E', // Format + // fmt subchunk + 'f', 'm', 't', ' ', // Subchunk1ID + 16, 0, 0, 0, // Subchunk1Size + 1, 0, // AudioFormat + littleBytes[0], littleBytes[1], // NumChannels + littleBytes[2], littleBytes[3], littleBytes[4], littleBytes[5], // SampleRate + littleBytes[6], littleBytes[7], littleBytes[8], littleBytes[9], // ByteRate + littleBytes[10], littleBytes[11], // BlockAlign + littleBytes[12], littleBytes[13], // BitsPerSample + // data subchunk + 'd', 'a', 't', 'a', // Subchunk2ID + 0, 0, 0, 0, // Subchunk2Size (must be updated later) + }); + } + + /** + * Updates the given wav file's header to include the final chunk sizes + * + * @param wav The wav file to update + * @throws IOException + */ + private static void updateWavHeader(File wav) throws IOException { + byte[] sizes = ByteBuffer + .allocate(8) + .order(ByteOrder.LITTLE_ENDIAN) + // There are probably a bunch of different/better ways to calculate + // these two given your circumstances. Cast should be safe since if the WAV is + // > 4 GB we've already made a terrible mistake. 
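+    // (For example, a 1,000,044-byte WAV gets ChunkSize = 1,000,036 and Subchunk2Size
+    // = 1,000,000: everything after the 8-byte RIFF preamble, and the raw PCM payload,
+    // respectively.)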
+ .putInt((int) (wav.length() - 8)) // ChunkSize + .putInt((int) (wav.length() - 44)) // Subchunk2Size + .array(); + + RandomAccessFile accessWave = null; + //noinspection CaughtExceptionImmediatelyRethrown + try { + accessWave = new RandomAccessFile(wav, "rw"); + // ChunkSize + accessWave.seek(4); + accessWave.write(sizes, 0, 4); + + // Subchunk2Size + accessWave.seek(40); + accessWave.write(sizes, 4, 4); + } catch (IOException ex) { + // Rethrow but we still close accessWave in our finally + throw ex; + } finally { + if (accessWave != null) { + try { + accessWave.close(); + } catch (IOException ex) { + // + } + } + } + } + + @Override + protected void onCancelled(Object[] results) { + // Handling cancellations and successful runs in the same way + Log.d("RecordWaveTask", "onCancelled"); + onPostExecute(results); + } + + @Override + protected void onPostExecute(Object[] results) { + Log.d("RecordWaveTask", "onPostExecute"); + Throwable throwable = null; + if (results[0] instanceof Throwable) { + // Error + throwable = (Throwable) results[0]; + Log.e(RecordWaveTask.class.getSimpleName(), throwable.getMessage(), throwable); + } + + if (cancelCompleteListener != null) { + cancelCompleteListener.onCancelCompleted(); + } + } +} \ No newline at end of file diff --git a/android/src/main/jni/WITCvad.c b/android/src/main/jni/WITCvad.c new file mode 100644 index 00000000..91eabf30 --- /dev/null +++ b/android/src/main/jni/WITCvad.c @@ -0,0 +1,357 @@ +// +// WITCvad.m +// Wit +// +// Created by Anthony Kesich on 11/12/14. +// Copyright (c) 2014 Willy Blandin. All rights reserved. +// + +#include "WITCvad.h" + + +/* + Adds value to the head of memory + */ +static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value); + +/* + Sums up the last N values of memory + */ +static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb); + + +int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags) +{ + double dfc; + double band_energy[DETECTOR_CVAD_N_ENERGY_BANDS]; + double sfm; + int fft_size = pow(2,floor(log2(cvad_state->samples_per_frame))); + short int counter; + int action = -1; + int zero_crossings; + + //only process cvad_state->samples_per_frame samples at a time + //frames_detector_cvad_fft(samples, fft_modules, cvad_state->samples_per_frame); + dfc = frames_detector_cvad_most_dominant_freq(cvad_state, fft_mags, fft_size, cvad_state->samples_per_frame); + sfm = frames_detector_cvad_spectral_flatness(fft_mags, fft_size); + zero_crossings = frames_detector_cvad_zero_crossings(samples, cvad_state->samples_per_frame); + frames_detector_cvad_multiband_energy(cvad_state, fft_mags, fft_size, band_energy, cvad_state->samples_per_frame); + + vw_detector_cvad_set_threshold(cvad_state); + counter = vw_detector_cvad_check_frame(cvad_state, band_energy, dfc, sfm, zero_crossings); + frame_memory_push(cvad_state, counter); + + if ((counter < 3 && cvad_state->talking == 0) || !cvad_state->thresh_initialized) { + cvad_state->silence_count++; + //only update reference levels if we don't detect speech + wv_detector_cvad_update_ref_levels(cvad_state, band_energy, dfc, sfm); + } + if (cvad_state->thresh_initialized) { + int start_sum = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_START); + int stop_sum_long = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG); + int stop_sum_short = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT); + int speech_time = 
(cvad_state->frame_number-cvad_state->speech_start_frame) * cvad_state->samples_per_frame * 1000 / cvad_state->sample_freq; + + if(start_sum > cvad_state->max_start_sum){ + cvad_state->max_start_sum = start_sum; + } + if (!cvad_state->talking && start_sum >= cvad_state->start_sum_threshold ) { + cvad_state->talking = 1; + cvad_state->speech_start_frame = cvad_state->frame_number; + action = 1; + } + else if (cvad_state->talking && speech_time > DETECTOR_CVAD_MINIMUM_LENGTH + && ((counter < 3 + && stop_sum_long <= cvad_state->max_start_sum*cvad_state->end_sum_long_coeff + && stop_sum_short <= cvad_state->max_start_sum*cvad_state->end_sum_short_coeff) + || (cvad_state->max_speech_time > 0 + && speech_time >= cvad_state->max_speech_time))) { + cvad_state->talking = 0; + action = 0; + cvad_state->max_start_sum = 0; + } + } + + cvad_state->frame_number++; + + return action; +} + +s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout) +{ + s_wv_detector_cvad_state *cvad_state = malloc(sizeof(s_wv_detector_cvad_state)); + cvad_state->energy_thresh_coeff_lower = DETECTOR_CVAD_E_TH_COEFF_LOW_BAND; + cvad_state->energy_thresh_coeff_upper = DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS; + cvad_state->sfm_thresh= DETECTOR_CVAD_SFM_TH; + cvad_state->dfc_thresh= DETECTOR_CVAD_DFC_TH; + cvad_state->min_zero_crossings= DETECTOR_CVAD_MIN_ZERO_CROSSINGS; + cvad_state->max_zero_crossings= DETECTOR_CVAD_MAX_ZERO_CROSSINGS; + memset(cvad_state->energy_update_coeff, 0.20, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double)); + memset(cvad_state->energy_prev_variance, -1, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double)); + memset(cvad_state->energy_history, 0, DETECTOR_CVAD_ENERGY_MEMORY * DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double)); + cvad_state->energy_history_index = 0; + cvad_state->dfc_update_coeff = 0.10; + cvad_state->sfm_update_coeff = 0.10; + cvad_state->frame_number = 0; + cvad_state->speech_start_frame = -1; + cvad_state->max_speech_time = speech_timeout; + cvad_state->thresh_initialized = 0; + cvad_state->silence_count = 0; + cvad_state->talking = 0; + memset(cvad_state->ref_energy, 0, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double)); + cvad_state->ref_dfc = 0; + cvad_state->ref_sfm = 0; + memset(cvad_state->dfc_history, 0, DETECTOR_CVAD_FRAMES_INIT * sizeof(double)); + cvad_state->sample_freq = sample_rate; + cvad_state->max_start_sum = 0; + cvad_state->samples_per_frame = pow(2,ceil(log2(cvad_state->sample_freq/150))); //around 100 frames per second, but must be a power of two + cvad_state->previous_state_index = 0; + memset(cvad_state->previous_state, 0, DETECTOR_CVAD_RESULT_MEMORY * sizeof(short int)); + + wv_detector_cvad_set_sensitivity(cvad_state, sensitivity); + + return cvad_state; +} + +void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state) +{ + free(cvad_state); +} + +void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity) +{ + float sensitivity_frac = fmax(0,fmin(100,sensitivity))/100.0; + cvad_state->n_frames_check_start=DETECTOR_CVAD_N_FRAMES_CHECK_START; + cvad_state->n_frames_check_end_short=DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT; + cvad_state->n_frames_check_end_long=DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG; + + cvad_state->start_sum_threshold = DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE*sensitivity_frac; + cvad_state->start_sum_threshold += DETECTOR_CVAD_COUNT_SUM_START*(1-sensitivity_frac); + + cvad_state->end_sum_short_coeff = DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE*sensitivity_frac; + 
cvad_state->end_sum_short_coeff += DETECTOR_CVAD_COUNT_END_SHORT_FACTOR*(1-sensitivity_frac);
+
+    cvad_state->end_sum_long_coeff = DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE*sensitivity_frac;
+    cvad_state->end_sum_long_coeff += DETECTOR_CVAD_COUNT_END_LONG_FACTOR*(1-sensitivity_frac);
+}
+
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state,
+                                        double *band_energy,
+                                        double dfc,
+                                        double sfm)
+{
+    int b=0;
+    if (!cvad_state->thresh_initialized) {
+        //if still initializing, accumulate values to average
+        for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+            cvad_state->ref_energy[b] += band_energy[b];
+        }
+
+
+        cvad_state->ref_sfm += sfm;
+
+        cvad_state->dfc_history[cvad_state->frame_number] = dfc > 0 ? log(dfc) : 0;
+    }
+
+    //record energy history
+    for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        cvad_state->energy_history[b][cvad_state->energy_history_index] = band_energy[b];
+    }
+    cvad_state->energy_history_index++;
+    cvad_state->energy_history_index%=DETECTOR_CVAD_ENERGY_MEMORY;
+
+    if (cvad_state->frame_number >= DETECTOR_CVAD_FRAMES_INIT) {
+        if(!cvad_state->thresh_initialized) {
+            //if done initializing, divide by number of samples to get an average
+            cvad_state->thresh_initialized = 1;
+            for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+                cvad_state->ref_energy[b] /= cvad_state->frame_number;
+            }
+
+            cvad_state->ref_sfm /= cvad_state->frame_number;
+
+            double sum = 0;
+            double sq_sum = 0;
+            for(b=0; b<DETECTOR_CVAD_FRAMES_INIT; b++){
+                cvad_state->ref_dfc+=cvad_state->dfc_history[b];
+                sum += cvad_state->dfc_history[b];
+                sq_sum += pow(cvad_state->dfc_history[b],2);
+            }
+            cvad_state->ref_dfc /= cvad_state->frame_number;
+            cvad_state->ref_dfc_var = (sq_sum-sum*sum/cvad_state->frame_number)/(cvad_state->frame_number -1);
+
+        } else if (cvad_state->talking == 0) {
+            //otherwise update thresholds based on adaptive rules if there's no speech
+            wv_detector_cvad_modify_update_coeffs(cvad_state);
+            for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+                cvad_state->ref_energy[b] *= (1-cvad_state->energy_update_coeff[b]);
+                cvad_state->ref_energy[b] += cvad_state->energy_update_coeff[b]*band_energy[b];
+            }
+
+        }
+    }
+
+}
+
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state)
+{
+    //update thresholds to be a multiple of the reference level
+    int b;
+    cvad_state->th_energy[0] = cvad_state->ref_energy[0]*cvad_state->energy_thresh_coeff_lower;
+    for(b=1; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        cvad_state->th_energy[b] = cvad_state->ref_energy[b]*cvad_state->energy_thresh_coeff_upper;
+    }
+    cvad_state->th_dfc = cvad_state->ref_dfc+cvad_state->dfc_thresh;
+    cvad_state->th_sfm = cvad_state->ref_sfm+cvad_state->sfm_thresh;
+}
+
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state){
+    int b;
+    for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        double sum = 0;
+        double sq_sum = 0;
+        int h;
+        for(h=0; h<DETECTOR_CVAD_ENERGY_MEMORY; h++){
+            sum+=cvad_state->energy_history[b][h];
+            sq_sum+=pow(cvad_state->energy_history[b][h],2);
+        }
+        double variance = (sq_sum-sum*sum/DETECTOR_CVAD_ENERGY_MEMORY)/(DETECTOR_CVAD_ENERGY_MEMORY-1);
+        double ratio = variance/cvad_state->energy_prev_variance[b];
+        if(ratio > 1.25){
+            cvad_state->energy_update_coeff[b] = 0.25;
+        } else if(ratio > 1.10){
+            cvad_state->energy_update_coeff[b] = 0.20;
+        } else if(ratio > 1.00){
+            cvad_state->energy_update_coeff[b] = 0.15;
+        } else if(ratio > 0.00){
+            cvad_state->energy_update_coeff[b] = 0.10;
+        } else {
+            //negative value indicates that this is the first pass of variance. Just set the coeff to 0.2
+            cvad_state->energy_update_coeff[b] = 0.20;
+        }
+        cvad_state->energy_prev_variance[b] = variance;
+    }
+}
+
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings)
+{
+    short int counter;
+
+    counter = 0;
+
+    int band_counter = 0;
+    if (band_energy[0] > cvad_state->th_energy[0]) {
+        counter += 2;
+    }
+
+    int b;
+    for(b=1; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        if(band_energy[b] > cvad_state->th_energy[b]){
+            band_counter++;
+        }
+    }
+    if(band_counter >= 2){
+        counter+=2;
+    }
+
+    if (fabs((dfc > 0 ? log(dfc): 0) - cvad_state->ref_dfc) > cvad_state->ref_dfc_var) {
+        counter++;
+    }
+    if (sfm > cvad_state->th_sfm) {
+        counter++;
+    }
+    if(zero_crossings >= cvad_state->min_zero_crossings && zero_crossings <= cvad_state->max_zero_crossings){
+        counter++;
+    }
+
+    return counter;
+}
+
+
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples)
+{
+    double k = 0.0f;
+    double max = 0.0f;
+    double amplitude_minimum = 1.0f;
+    int i;
+
+    for (i = 0; i < nb_modules; i++) {
+        if (fft_mags[i] > max && fft_mags[i] > amplitude_minimum) {
+            max = fft_mags[i];
+            k = i;
+        }
+    }
+
+    return k * (double)cvad_state->sample_freq / (double)nb_samples;
+}
+
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples){
+
+    int b = 0;
+    int k = 0;
+
+    for(b = 0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        band_energy[b] = 0;
+        while(k*cvad_state->sample_freq/nb_samples < 1000*(b+1)){
+            band_energy[b]+=fft_mags[k];
+            k++;
+        }
+    }
+
+}
+
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb)
+{
+    double geo_mean = 0.0f;
+    double arithm_mean = 0.0f;
+    double sfm = 0.0f;
+    int i;
+
+    for (i = 0; i < nb; i++) {
+        if (fft_mags[i] != 0.0f) {
+            geo_mean += log(fft_mags[i]);
+            arithm_mean += fft_mags[i];
+        }
+    }
+    geo_mean = exp(geo_mean / (double) nb);
+    arithm_mean = arithm_mean / (double) nb;
+    sfm = 10 * log10(geo_mean / arithm_mean);
+    sfm = fabs(sfm);
+
+    return sfm;
+}
+
+int frames_detector_cvad_zero_crossings(short int *samples, int nb){
+    int num_zero_crossings = 0;
+    int i;
+
+    for(i=1; i<nb; i++){
+        if((samples[i-1] < 0) != (samples[i] < 0)){
+            num_zero_crossings++;
+        }
+    }
+
+    return num_zero_crossings;
+}
+
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value)
+{
+    cvad_state->previous_state[cvad_state->previous_state_index] = value;
+    cvad_state->previous_state_index++;
+    cvad_state->previous_state_index%=DETECTOR_CVAD_RESULT_MEMORY;
+}
+
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb)
+{
+    int i = 0;
+    int sum = 0;
+
+    for (i = 0; i < nb; i++) {
+        int indx = (cvad_state->previous_state_index - (i+1) + DETECTOR_CVAD_RESULT_MEMORY) % DETECTOR_CVAD_RESULT_MEMORY;
+        sum += cvad_state->previous_state[indx];
+    }
+
+    return sum;
+}
+
diff --git a/android/src/main/jni/WITCvad.h b/android/src/main/jni/WITCvad.h
new file mode 100644
index 00000000..48795946
--- /dev/null
+++ b/android/src/main/jni/WITCvad.h
@@ -0,0 +1,169 @@
+//
+//  WITCvad.h
+//  Wit
+//
+//  Created by Anthony Kesich on 11/12/14.
+//  Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#ifndef Wit_WITCvad_h
+#define Wit_WITCvad_h
+
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+
+/*
+ * This speech algorithm looks at multiple auditory components related to speech:
+ *    - Energy divided into 1 KHz bands
+ *    - Dominant Frequency Component
+ *    - Spectral Flatness Measure
+ *    - Zero-crossings
+ *
+ * If many features of speech are present for a period of time (~150 ms), speech is detected.
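+ * For example, at a 22050 Hz sample rate each frame holds 2^ceil(log2(22050/150)) = 256
+ * samples (~86 frames per second), so the 15-frame start window corresponds to roughly 170 ms.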
+ * The end of speech is determined by most features of speech disappearing for an extended period of time (~1 sec) + */ + +#define DETECTOR_CVAD_FRAMES_INIT 40 /* number of frames to use to initialize values */ +#define DETECTOR_CVAD_E_TH_COEFF_LOW_BAND 2.5f /* Energy threshold coefficient */ +#define DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS 2.0f /* Energy threshold coefficient */ +#define DETECTOR_CVAD_SFM_TH 3.0f /* Spectral Flatness Measure threshold */ +#define DETECTOR_CVAD_DFC_TH 250.0f /* most Dominant Frequency Component threshold */ +#define DETECTOR_CVAD_MIN_ZERO_CROSSINGS 5 /* fewest zero crossings for speech */ +#define DETECTOR_CVAD_MAX_ZERO_CROSSINGS 15 /* maximum zero crossings for speech */ +#define DETECTOR_CVAD_RESULT_MEMORY 130 /* number of frame results to keep in memory */ +#define DETECTOR_CVAD_ENERGY_MEMORY 20 /* number of frame results to keep in memory */ +#define DETECTOR_CVAD_N_ENERGY_BANDS 5 /* number of 1 KHz energy bands to compute */ +#define DETECTOR_CVAD_MINIMUM_LENGTH 1000 /* minimum length of vad in ms */ + +//final speech detection variables +#define DETECTOR_CVAD_N_FRAMES_CHECK_START 15 +#define DETECTOR_CVAD_COUNT_SUM_START 4.5*DETECTOR_CVAD_N_FRAMES_CHECK_START +#define DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE 3.8*DETECTOR_CVAD_N_FRAMES_CHECK_START +#define DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT 1.5*DETECTOR_CVAD_N_FRAMES_CHECK_START +#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR 0.6 +#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE 0.3 +#define DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG 6.5*DETECTOR_CVAD_N_FRAMES_CHECK_START +#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR 1.8 +#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE 1.5 + +typedef struct { + double energy_thresh_coeff_lower; + double energy_thresh_coeff_upper; + double sfm_thresh; + double dfc_thresh; + double th_energy[DETECTOR_CVAD_N_ENERGY_BANDS]; + double th_sfm; + double th_dfc; + double ref_energy[DETECTOR_CVAD_N_ENERGY_BANDS]; + double ref_sfm; + double ref_dfc; + double ref_dfc_var; + double energy_update_coeff[DETECTOR_CVAD_N_ENERGY_BANDS]; + double energy_prev_variance[DETECTOR_CVAD_N_ENERGY_BANDS]; + double energy_history[DETECTOR_CVAD_N_ENERGY_BANDS][DETECTOR_CVAD_ENERGY_MEMORY]; + double sfm_update_coeff; + double dfc_history[DETECTOR_CVAD_FRAMES_INIT]; + double dfc_update_coeff; + float end_sum_long_coeff; + float end_sum_short_coeff; + int frame_number; + int speech_start_frame; + int max_speech_time; + int energy_history_index; + int min_zero_crossings; + int max_zero_crossings; + int thresh_initialized; + int silence_count; + int talking; + int sample_freq; + int samples_per_frame; + int max_start_sum; + int n_frames_check_start; + int n_frames_check_end_short; + int n_frames_check_end_long; + int start_sum_threshold; + int previous_state_index; + short int previous_state[DETECTOR_CVAD_RESULT_MEMORY]; +} s_wv_detector_cvad_state; + +/* + Main entry point to the detection algorithm. 
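+    Feed it one frame of cvad_state->samples_per_frame samples at a time, together with
+    the first samples_per_frame/2 FFT magnitudes computed from that frame.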
+    This returns a -1 if there is no change in state, a 1 if someone started talking, and a 0 if speech ended
+ */
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags);
+
+
+/*
+    Initialize the cvad_state structure, which represents the state of
+    one instance of the algorithm
+
+    sensitivity: 0-100; lower values suit a close-up mic, higher values a fixed, distant mic
+ */
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout);
+
+/*
+    Safely frees memory for a cvad_state
+ */
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state);
+
+/*
+    Set VAD sensitivity (0-100)
+     - Lower values are for strong voice signals like for a cellphone or personal mic
+     - Higher values are for use with a fixed-position mic or any application with voice buried in ambient noise
+     - Defaults to 0
+ */
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity);
+
+/*
+    Set the reference values of the energy, most dominant frequency component and the spectral flatness measure.
+    The threshold value is then set based on the "background" reference levels
+ */
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm);
+
+/*
+    Set the threshold on the cvad_state.
+ */
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state);
+
+/*
+    Computes the variance of the energy over the past few windows and adapts the update coeffs accordingly
+ */
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state);
+
+/*
+    Compare the distance between the value and the minimum value of each component and return how many
+    component(s) responded positively.
+    Each frame with more than 2 (out of 3) matching features is qualified as a speech frame.
+    example : energy - cvad_state->min_energy > cvad_state->th_energy
+ */
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings);
+
+/*
+    Return the frequency with the biggest amplitude (from a frame).
+ */
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples);
+
+/*
+    Computes the energy of the first DETECTOR_CVAD_N_ENERGY_BANDS 1 KHz bands
+ */
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples);
+
+/*
+    Compute the spectral flatness of a frame.
+    It tells us if all the frequencies have a similar amplitude, which would mean noise,
+    or if there are some dominant frequencies, which could mean voice.
+ */
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb);
+
+/*
+    Counts the number of times the signal crosses zero
+    Even soft vocalizations have a fairly regular number of zero crossings (~5-15 for 10ms)
+ */
+int frames_detector_cvad_zero_crossings(short int *samples, int nb);
+
+#endif
diff --git a/android/src/main/jni/WITVadSimple.c b/android/src/main/jni/WITVadSimple.c
new file mode 100644
index 00000000..a94d8896
--- /dev/null
+++ b/android/src/main/jni/WITVadSimple.c
@@ -0,0 +1,206 @@
+//
+//  WITVadSimple.c
+//  Wit
+//
+//  Created by Aric Lasry on 8/6/14.
+//  Copyright (c) 2014 Willy Blandin. All rights reserved.
+// + + +#include "WITVadSimple.h" + +/** + * wvs_pcm16short2dbfs - converts short (16 bits) samples to decibel full scale + * @samples: array of pcm 16 bits samples + * @size: numbers of sample + * + * Return a new allocated buffer of double, which will need to be free later + */ +static double * wvs_pcm16short2dbfs(short int *samples, int size); + +static double frames_detector_esf_energy(double *samples, int nb_samples); +static void detector_esf_minimum(wvs_state *state, double energy, int n); +static int detector_esf_check_frame(wvs_state *state, double energy); +static void memory_push(int *memory, int length, int value); +static int frame_memory_lte(int *memory, int value, int nb); +static int frame_memory_gte(int *memory, int value, int nb); +static int wvs_check(wvs_state *state, double *samples, int nb_samples); + + +int wvs_still_talking(wvs_state *state, short int *samples, int nb_samples) +{ + double *dbfss; + double db; + int result; + int i_sample; + + dbfss = wvs_pcm16short2dbfs(samples, nb_samples); + for (i_sample = 0; i_sample < nb_samples; i_sample++) { + db = dbfss[i_sample]; + if (isinf(db)) { + continue; + } + if (state->current_nb_samples == state->samples_per_frame) { + result = wvs_check(state, state->samples, state->current_nb_samples); + if (result == 0 || result == 1) { + free(dbfss); + return result; + } + state->current_nb_samples = 0; + } + state->samples[state->current_nb_samples] = db; + state->current_nb_samples++; + } + free(dbfss); + + return -1; +} + +static int wvs_check(wvs_state *state, double *samples, int nb_samples) +{ + int counter; + double energy; + int action; + char debug_msg[128]; + + action = -1; + energy = frames_detector_esf_energy(samples, nb_samples); + + if (state->sequence <= state->init_frames) { + detector_esf_minimum(state, energy, state->sequence); + } + counter = detector_esf_check_frame(state, energy); + if (state->sequence >= state->init_frames && !counter && !state->talking) { + detector_esf_minimum(state, energy, state->sequence); + } + memory_push(state->previous_state, state->previous_state_maxlen, counter); + if (state->sequence < state->init_frames) { + state->sequence++; + return -1; + } + if (state->talking == 0 && frame_memory_gte(state->previous_state, 1, 10)) { + state->talking = 1; + __android_log_write(ANDROID_LOG_DEBUG, "WitVAD", "Speak start"); + action = 1; + } + else if (state->talking == 1 && frame_memory_lte(state->previous_state, 0, state->previous_state_maxlen)) { + state->talking = 0; + action = 0; + __android_log_write(ANDROID_LOG_DEBUG, "WitVAD", "Speak end"); + } + state->sequence++; + + return action; +} + +wvs_state *wvs_init(double threshold, int sample_rate) +{ + wvs_state *state; + + state = malloc(sizeof(*state)); + state->sequence = 0; + state->min_initialized = 0; + state->init_frames = 30; + state->energy_threshold = threshold; + state->previous_state_maxlen = 50; + state->previous_state = malloc(sizeof(*state->previous_state) * state->previous_state_maxlen); + state->talking = 0; + state->sample_rate = sample_rate; + state->samples_per_frame = state->sample_rate / 100; + state->samples = malloc(sizeof(*state->samples) * state->samples_per_frame); + state->current_nb_samples = 0; + state->min_energy = 0.0; + + return state; +} + +void wvs_clean(wvs_state *state) +{ + free(state->samples); + free(state->previous_state); + free(state); +} + +static double * wvs_pcm16short2dbfs(short int *samples, int size) +{ + double *dbfss; + double max_ref; + int i; + + max_ref = 32768; //pow(2.0, 16.0) / 2; 
signed 16 bits w/o the -1
+    dbfss = malloc(sizeof(*dbfss) * size);
+
+    for (i = 0; i < size; i++) {
+        // 0 for a full-scale sample; grows as the sample gets quieter (dB below full scale)
+        dbfss[i] = 0 - 20 * log10(fabs(samples[i] / max_ref));
+    }
+
+    return dbfss;
+}
+
+static double frames_detector_esf_energy(double *samples, int nb_samples)
+{
+    double energy = 0.0f;
+    int i;
+
+    for (i = 0; i < nb_samples; i++) {
+        energy += samples[i];
+    }
+    energy /= nb_samples;
+
+    return energy;
+}
+
+static void detector_esf_minimum(wvs_state *state, double energy, int n)
+{
+    n = (n > 10) ? 10 : n; //this corresponds to 1/10 of a second
+    state->min_energy = (state->min_energy * n + energy) / (n + 1);
+    state->min_initialized = 1;
+}
+
+static int detector_esf_check_frame(wvs_state *state, double energy)
+{
+    int counter;
+
+    counter = 0;
+    char debug_msg[200];
+
+    if ((0 - (energy - state->min_energy)) >= state->energy_threshold) {
+        counter++;
+    }
+
+    return counter;
+}
+
+static void memory_push(int *memory, int length, int value)
+{
+    while (--length) {
+        memory[length] = memory[length - 1];
+    }
+    memory[0] = value;
+}
+
+static int frame_memory_gte(int *memory, int value, int nb)
+{
+    int i = 0;
+
+    for (i = 0; i < nb; i++) {
+        if (memory[i] < value) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
+
+static int frame_memory_lte(int *memory, int value, int nb)
+{
+    int i;
+
+    for (i = 0; i < nb; i++) {
+        if (memory[i] > value) {
+            return 0;
+        }
+    }
+
+    return 1;
+}
diff --git a/android/src/main/jni/WITVadSimple.h b/android/src/main/jni/WITVadSimple.h
new file mode 100644
index 00000000..7c1a9802
--- /dev/null
+++ b/android/src/main/jni/WITVadSimple.h
@@ -0,0 +1,73 @@
+//
+//  WITVadSimple.h
+//  Wit
+//
+//  Created by Aric Lasry on 8/6/14.
+//  Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#ifndef Wit_WITVadSimple_h
+#define Wit_WITVadSimple_h
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <android/log.h>
+
+
+
+/**
+ * This voice activity detection is very simple. It computes the average of the
+ * audio powers from the beginning and the last second, and compares the distance
+ * between the two with a pre-defined threshold.
+ *
+ * The "audio powers" are averages of audio chunks in DBFS. It could also be PCM samples...
+ */
+
+/*
+    state of the voice activity detection algorithm.
+ */
+typedef struct {
+    /* frame number */
+    int sequence;
+
+    /* is the environment initialized? */
+    int min_initialized;
+
+    /* frame number needed for initialization */
+    int init_frames;
+
+    double energy_threshold;
+
+    double min_energy;
+
+    int *previous_state;
+
+    int previous_state_maxlen;
+
+    int talking;
+
+    /* number of samples per second */
+    int sample_rate;
+
+    /* number of samples needed to calculate the feature(s) */
+    int samples_per_frame;
+
+    /* samples list to send to the checking function when enough are available */
+    double *samples;
+
+    int current_nb_samples;
+} wvs_state;
+
+int wvs_still_talking(wvs_state *state, short int *samples, int nb_samples);
+
+wvs_state *wvs_init(double threshold, int sample_rate);
+
+/**
+ * wvs_clean - clean a wvs_state* structure
+ * @state: the structure to free.
+ */
+void wvs_clean(wvs_state *state);
+
+#endif
diff --git a/android/src/main/jni/WITVadWrapper.c b/android/src/main/jni/WITVadWrapper.c
new file mode 100644
index 00000000..197b1bb2
--- /dev/null
+++ b/android/src/main/jni/WITVadWrapper.c
@@ -0,0 +1,55 @@
+#include "WITCvad.h"
+#include <jni.h>
+
+
+static s_wv_detector_cvad_state* wit_vad_g_struct = 0;
+
+int Java_com_rnim_rn_audio_RecordWaveTask_VadInit(JNIEnv *env, jobject obj, jint sample_rate, jint vadSensitivity, jint vadTimeout)
+{
+    vadSensitivity = (int)fmax(0,fmin(100,vadSensitivity)); //bounds-checking
+    wit_vad_g_struct = wv_detector_cvad_init(sample_rate, (int)vadSensitivity, (int)vadTimeout);
+
+    return 0;
+}
+
+
+int Java_com_rnim_rn_audio_RecordWaveTask_VadStillTalking(JNIEnv *env, jobject obj, jshortArray java_arr, jfloatArray java_fft_arr)
+{
+    short int *samples;
+    float *fft_mags;
+    int i, sum = 0;
+    int result;
+    jshort *native_arr = (*env)->GetShortArrayElements(env, java_arr, NULL);
+    jfloat *native_fft_arr = (*env)->GetFloatArrayElements(env, java_fft_arr, NULL);
+    int arr_len = wit_vad_g_struct->samples_per_frame;
+
+    samples = malloc(sizeof(*samples) * arr_len);
+    for (i = 0; i < arr_len; i++) {
+        samples[i] = native_arr[i];
+    }
+    (*env)->ReleaseShortArrayElements(env, java_arr, native_arr, 0);
+
+    fft_mags = malloc(sizeof(*fft_mags) * arr_len);
+    for (i = 0; i < arr_len/2; i++) {
+        fft_mags[i] = native_fft_arr[i];
+    }
+    (*env)->ReleaseFloatArrayElements(env, java_fft_arr, native_fft_arr, 0);
+
+    result = wvs_cvad_detect_talking(wit_vad_g_struct, samples, fft_mags);
+    free(samples);
+    free(fft_mags);
+
+    return result;
+}
+
+void Java_com_rnim_rn_audio_RecordWaveTask_VadClean()
+{
+    if (wit_vad_g_struct) {
+        wv_detector_cvad_clean(wit_vad_g_struct);
+        wit_vad_g_struct = 0;
+    }
+}
+
+int Java_com_rnim_rn_audio_RecordWaveTask_GetVadSamplesPerFrame(){
+    return wit_vad_g_struct->samples_per_frame;
+}
diff --git a/index.js b/index.js
index 450ed839..b3cfe295 100644
--- a/index.js
+++ b/index.js
@@ -12,7 +12,7 @@ import ReactNative, {
 var AudioRecorderManager = NativeModules.AudioRecorderManager;
 
 var AudioRecorder = {
-  prepareRecordingAtPath: function(path, options) {
+  prepareStreamingAtPath: function(path, bufferSize=8192, options, vadOptions) {
     if (this.progressSubscription) this.progressSubscription.remove();
     this.progressSubscription = NativeAppEventEmitter.addListener('recordingProgress',
       (data) => {
@@ -31,45 +31,76 @@ var AudioRecorder = {
       }
     );
 
+    if (this.dataReceivedSubscription) this.dataReceivedSubscription.remove();
+    this.dataReceivedSubscription = NativeAppEventEmitter.addListener('dataReceived',
+      (data) => {
+        if (this.onDataReceived) {
+          this.onDataReceived(data);
+        }
+      }
+    );
+
+    if (this.vadReceivedSubscription) this.vadReceivedSubscription.remove();
+    this.vadReceivedSubscription = NativeAppEventEmitter.addListener('vadReceived',
+      (vadResult) => {
+        if (this.onVadReceived) {
+          this.onVadReceived(vadResult);
+        }
+      }
+    );
+
     var defaultOptions = {
       SampleRate: 44100.0,
-      Channels: 2,
+      Channels: 1,
       AudioQuality: 'High',
       AudioEncoding: 'ima4',
-      OutputFormat: 'mpeg_4',
       MeteringEnabled: false,
-      AudioEncodingBitRate: 32000
+      AudioSource: 'DEFAULT',
+      // OutputFormat: 'mpeg_4',
+      // AudioEncodingBitRate: 32000
     };
 
    var recordingOptions = {...defaultOptions, ...options};

+    var defaultVadOptions = {
+      Sensitivity: 0,
+      Timeout: 7000,
+    }
+
+    var vadOptions = {...defaultVadOptions, ...vadOptions};
+
    if (Platform.OS === 'ios') {
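+      // The iOS native method takes positional arguments, so the option objects are
+      // flattened here; the Android branch below hands the settings and VAD maps to
+      // the native module as-is.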
AudioRecorderManager.prepareStreamingAtPath( path, + bufferSize, recordingOptions.SampleRate, recordingOptions.Channels, recordingOptions.AudioQuality, recordingOptions.AudioEncoding, - recordingOptions.MeteringEnabled + recordingOptions.MeteringEnabled, + vadOptions.Sensitivity, + vadOptions.Timeout, ); } else { - return AudioRecorderManager.prepareRecordingAtPath(path, recordingOptions); + return AudioRecorderManager.prepareStreamingAtPath(path, bufferSize, recordingOptions, vadOptions); } }, - startRecording: function() { - return AudioRecorderManager.startRecording(); + startStreaming: function() { + return AudioRecorderManager.startStreaming(); }, - pauseRecording: function() { - return AudioRecorderManager.pauseRecording(); + stopStreaming: function() { + return AudioRecorderManager.stopStreaming(); }, - stopRecording: function() { - return AudioRecorderManager.stopRecording(); + pauseStreaming: function() { + return AudioRecorderManager.pauseStreaming(); }, checkAuthorizationStatus: AudioRecorderManager.checkAuthorizationStatus, requestAuthorization: AudioRecorderManager.requestAuthorization, removeListeners: function() { if (this.progressSubscription) this.progressSubscription.remove(); if (this.finishedSubscription) this.finishedSubscription.remove(); + if (this.dataReceivedSubscription) this.dataReceivedSubscription.remove(); + if (this.vadReceivedSubscription) this.vadReceivedSubscription.remove(); }, }; diff --git a/ios/AudioRecorderManager.h b/ios/AudioRecorderManager.h index d117e923..743e8e0c 100644 --- a/ios/AudioRecorderManager.h +++ b/ios/AudioRecorderManager.h @@ -9,7 +9,8 @@ #import #import #import +#import "WITVad.h" -@interface AudioRecorderManager : NSObject +@interface AudioRecorderManager : NSObject -@end \ No newline at end of file +@end diff --git a/ios/AudioRecorderManager.m b/ios/AudioRecorderManager.m index c8e9bbb8..355cb192 100644 --- a/ios/AudioRecorderManager.m +++ b/ios/AudioRecorderManager.m @@ -12,13 +12,17 @@ #import #import #import +#import "StreamingModule.h" +#import "WITVad.h" NSString *const AudioRecorderEventProgress = @"recordingProgress"; NSString *const AudioRecorderEventFinished = @"recordingFinished"; +NSString *const AudioRecorderEventDataReceived = @"dataReceived"; +NSString *const AudioRecorderEventVadReceived = @"vadReceived"; @implementation AudioRecorderManager { - AVAudioRecorder *_audioRecorder; + // AVAudioRecorder *_audioRecorder; NSTimeInterval _currentTime; id _progressUpdateTimer; @@ -29,17 +33,22 @@ @implementation AudioRecorderManager { NSNumber *_audioEncoding; NSNumber *_audioChannels; NSNumber *_audioSampleRate; - AVAudioSession *_recordSession; BOOL _meteringEnabled; + int _bufferSize; + int _vadSensitivity; + int _vadTimeout; } +StreamingModule* streamingModule; +WITVad *vad; + @synthesize bridge = _bridge; RCT_EXPORT_MODULE(); - (void)sendProgressUpdate { - if (_audioRecorder && _audioRecorder.recording) { - _currentTime = _audioRecorder.currentTime; + if (streamingModule && streamingModule->recording) { + _currentTime = streamingModule->currentTime; } else { return; } @@ -48,11 +57,13 @@ - (void)sendProgressUpdate { (([_prevProgressUpdateTime timeIntervalSinceNow] * -1000.0) >= _progressUpdateInterval)) { NSMutableDictionary *body = [[NSMutableDictionary alloc] init]; [body setObject:[NSNumber numberWithFloat:_currentTime] forKey:@"currentTime"]; + /* if (_meteringEnabled) { [_audioRecorder updateMeters]; float _currentMetering = [_audioRecorder averagePowerForChannel: 0]; [body setObject:[NSNumber 
       [body setObject:[NSNumber numberWithFloat:_currentMetering] forKey:@"currentMetering"];
     }
+    */
 
     [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventProgress body:body];
 
@@ -74,10 +85,10 @@ - (void)startProgressTimer {
   [_progressUpdateTimer addToRunLoop:[NSRunLoop mainRunLoop] forMode:NSDefaultRunLoopMode];
 }
 
-- (void)audioRecorderDidFinishRecording:(AVAudioRecorder *)recorder successfully:(BOOL)flag {
-  [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventFinished body:@{
-    @"status": flag ? @"OK" : @"ERROR",
-    @"audioFileURL": [_audioFileURL absoluteString]
+- (void)finishRecording:(BOOL)flag {
+  [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventFinished body:@{
+      @"status": flag ? @"OK" : @"ERROR",
+      @"audioFileURL": [_audioFileURL absoluteString]
   }];
 }
 
@@ -88,124 +99,6 @@ - (NSString *) applicationDocumentsDirectory
   return basePath;
 }
 
-RCT_EXPORT_METHOD(prepareRecordingAtPath:(NSString *)path sampleRate:(float)sampleRate channels:(nonnull NSNumber *)channels quality:(NSString *)quality encoding:(NSString *)encoding meteringEnabled:(BOOL)meteringEnabled)
-{
-  _prevProgressUpdateTime = nil;
-  [self stopProgressTimer];
-
-  _audioFileURL = [NSURL fileURLWithPath:path];
-
-  // Default options
-  _audioQuality = [NSNumber numberWithInt:AVAudioQualityHigh];
-  _audioEncoding = [NSNumber numberWithInt:kAudioFormatAppleIMA4];
-  _audioChannels = [NSNumber numberWithInt:2];
-  _audioSampleRate = [NSNumber numberWithFloat:44100.0];
-  _meteringEnabled = NO;
-
-  // Set audio quality from options
-  if (quality != nil) {
-    if ([quality isEqual: @"Low"]) {
-      _audioQuality =[NSNumber numberWithInt:AVAudioQualityLow];
-    } else if ([quality isEqual: @"Medium"]) {
-      _audioQuality =[NSNumber numberWithInt:AVAudioQualityMedium];
-    } else if ([quality isEqual: @"High"]) {
-      _audioQuality =[NSNumber numberWithInt:AVAudioQualityHigh];
-    }
-  }
-
-  // Set channels from options
-  if (channels != nil) {
-    _audioChannels = channels;
-  }
-
-  // Set audio encoding from options
-  if (encoding != nil) {
-    if ([encoding isEqual: @"lpcm"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatLinearPCM];
-    } else if ([encoding isEqual: @"ima4"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleIMA4];
-    } else if ([encoding isEqual: @"aac"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEG4AAC];
-    } else if ([encoding isEqual: @"MAC3"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE3];
-    } else if ([encoding isEqual: @"MAC6"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE6];
-    } else if ([encoding isEqual: @"ulaw"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatULaw];
-    } else if ([encoding isEqual: @"alaw"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatALaw];
-    } else if ([encoding isEqual: @"mp1"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer1];
-    } else if ([encoding isEqual: @"mp2"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer2];
-    } else if ([encoding isEqual: @"alac"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleLossless];
-    } else if ([encoding isEqual: @"amr"]) {
-      _audioEncoding =[NSNumber numberWithInt:kAudioFormatAMR];
-    }
-  }
-
-  // Set sample rate from options
-  _audioSampleRate = [NSNumber numberWithFloat:sampleRate];
-
-  NSDictionary *recordSettings = [NSDictionary dictionaryWithObjectsAndKeys:
-    _audioQuality, AVEncoderAudioQualityKey,
-    _audioEncoding, AVFormatIDKey,
-    _audioChannels, AVNumberOfChannelsKey,
-    _audioSampleRate, AVSampleRateKey,
-    nil];
-
-  // Enable metering from options
-  if (meteringEnabled != NO) {
-    _meteringEnabled = meteringEnabled;
-  }
-
-  NSError *error = nil;
-
-  _recordSession = [AVAudioSession sharedInstance];
-  [_recordSession setCategory:AVAudioSessionCategoryMultiRoute error:nil];
-
-  _audioRecorder = [[AVAudioRecorder alloc]
-                initWithURL:_audioFileURL
-                settings:recordSettings
-                error:&error];
-
-  _audioRecorder.meteringEnabled = _meteringEnabled;
-  _audioRecorder.delegate = self;
-
-  if (error) {
-    NSLog(@"error: %@", [error localizedDescription]);
-    // TODO: dispatch error over the bridge
-  } else {
-    [_audioRecorder prepareToRecord];
-  }
-}
-
-RCT_EXPORT_METHOD(startRecording)
-{
-  if (!_audioRecorder.recording) {
-    [self startProgressTimer];
-    [_recordSession setActive:YES error:nil];
-    [_audioRecorder record];
-
-  }
-}
-
-RCT_EXPORT_METHOD(stopRecording)
-{
-  [_audioRecorder stop];
-  [_recordSession setActive:NO error:nil];
-  _prevProgressUpdateTime = nil;
-}
-
-RCT_EXPORT_METHOD(pauseRecording)
-{
-  if (_audioRecorder.recording) {
-    [self stopProgressTimer];
-    [_audioRecorder pause];
-  }
-}
-
 RCT_EXPORT_METHOD(checkAuthorizationStatus:(RCTPromiseResolveBlock)resolve reject:(__unused RCTPromiseRejectBlock)reject)
 {
   AVAudioSessionRecordPermission permissionStatus = [[AVAudioSession sharedInstance] recordPermission];
@@ -237,6 +130,146 @@ - (NSString *) applicationDocumentsDirectory
   }];
 }
 
+RCT_EXPORT_METHOD(prepareStreamingAtPath:(NSString *)path bufferSize:(int)bufferSize sampleRate:(float)sampleRate channels:(nonnull NSNumber *)channels quality:(NSString *)quality encoding:(NSString *)encoding meteringEnabled:(BOOL)meteringEnabled vadSensitivity:(int)vadSensitivity vadTimeout:(int)vadTimeout)
+{
+  NSLog(@"prepareStreaming");
+  _audioFileURL = [NSURL fileURLWithPath:path];
+
+  // Default options
+  _audioQuality = [NSNumber numberWithInt:AVAudioQualityHigh];
+  _audioEncoding = [NSNumber numberWithInt:kAudioFormatAppleIMA4];
+  _audioChannels = [NSNumber numberWithInt:1];
+  _audioSampleRate = [NSNumber numberWithFloat:44100.0];
+  _meteringEnabled = NO;
+  _bufferSize = 8192;
+  _vadSensitivity = 0;
+  _vadTimeout = 7000;
+
+  // Set audio quality from options
+  if (quality != nil) {
+    if ([quality isEqual: @"Low"]) {
+      _audioQuality =[NSNumber numberWithInt:AVAudioQualityLow];
+    } else if ([quality isEqual: @"Medium"]) {
+      _audioQuality =[NSNumber numberWithInt:AVAudioQualityMedium];
+    } else if ([quality isEqual: @"High"]) {
+      _audioQuality =[NSNumber numberWithInt:AVAudioQualityHigh];
+    }
+  }
+
+  // Set channels from options
+  if (channels != nil) {
+    _audioChannels = channels;
+  }
+
+  // Set audio encoding from options
+  if (encoding != nil) {
+    if ([encoding isEqual: @"lpcm"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatLinearPCM];
+    } else if ([encoding isEqual: @"ima4"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleIMA4];
+    } else if ([encoding isEqual: @"aac"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEG4AAC];
+    } else if ([encoding isEqual: @"MAC3"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE3];
+    } else if ([encoding isEqual: @"MAC6"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE6];
+    } else if ([encoding isEqual: @"ulaw"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatULaw];
+    } else if ([encoding isEqual: @"alaw"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatALaw];
+    } else if ([encoding isEqual: @"mp1"]) {
+      _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer1];
isEqual: @"mp2"]) { + _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer2]; + } else if ([encoding isEqual: @"alac"]) { + _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleLossless]; + } else if ([encoding isEqual: @"amr"]) { + _audioEncoding =[NSNumber numberWithInt:kAudioFormatAMR]; + } + } + + // Set sample rate from options + _audioSampleRate = [NSNumber numberWithFloat:sampleRate]; + + // Set buffer size from options + _bufferSize = bufferSize; + + NSDictionary *recordSettings = [NSDictionary dictionaryWithObjectsAndKeys: + //_audioQuality, AVEncoderAudioQualityKey, + //_audioEncoding, AVFormatIDKey, + _audioChannels, AVNumberOfChannelsKey, + _audioSampleRate, AVSampleRateKey, + nil]; + + // Enable metering from options + if (meteringEnabled != NO) { + _meteringEnabled = meteringEnabled; + } + + _vadSensitivity = vadSensitivity; + _vadTimeout = vadTimeout; + + if (vad == nil) { + vad = [[WITVad alloc] initWithAudioSampleRate:[_audioSampleRate intValue] + vadSensitivity:_vadSensitivity + vadTimeout:_vadTimeout]; + vad.delegate = self; + } + + streamingModule = [[StreamingModule alloc] init]; + [streamingModule prepare:_audioFileURL + bufferSize:_bufferSize + settings:recordSettings + handler:^(AVAudioPCMBuffer *buf){ + NSMutableArray *body = [[NSMutableArray alloc] init]; + for(int i=0; imBytesPerFrame; + NSData *audio = [[NSData alloc] initWithBytes:int16ChannelData length:length]; + [vad gotAudioSamples:audio]; + } + [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventDataReceived body:body]; + } + ]; +} + +RCT_EXPORT_METHOD(startStreaming) +{ + NSLog(@"startStreaming"); + NSLog(@"%@", _audioFileURL); + [self startProgressTimer]; + [[AVAudioSession sharedInstance] setActive:YES error:nil]; + [streamingModule start]; +} + +RCT_EXPORT_METHOD(stopStreaming) +{ + NSLog(@"stopStreaming"); + [streamingModule stop]; + [[AVAudioSession sharedInstance] setActive:NO error:nil]; + _prevProgressUpdateTime = nil; + if (vad) { + vad.delegate = nil; + vad = nil; + } + [self finishRecording: true]; +} + +RCT_EXPORT_METHOD(pauseStreaming) +{ + NSLog(@"pauseStreaming"); + [self stopProgressTimer]; + [streamingModule pause]; +} + + - (NSString *)getPathForDirectory:(int)directory { NSArray *paths = NSSearchPathForDirectoriesInDomains(directory, NSUserDomainMask, YES); @@ -253,4 +286,14 @@ - (NSDictionary *)constantsToExport }; } +-(void)vadStartedTalking { + NSLog(@"Started Talking"); + [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventVadReceived body:[NSNumber numberWithInt:1]]; +} + +-(void)vadStoppedTalking { + NSLog(@"Stopped Talking"); + [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventVadReceived body:[NSNumber numberWithInt:0]]; +} + @end diff --git a/ios/RNAudio.xcodeproj/project.pbxproj b/ios/RNAudio.xcodeproj/project.pbxproj index 5e1326f0..d5178132 100644 --- a/ios/RNAudio.xcodeproj/project.pbxproj +++ b/ios/RNAudio.xcodeproj/project.pbxproj @@ -7,7 +7,11 @@ objects = { /* Begin PBXBuildFile section */ + 38D7625B1EDD3F58007B8DE3 /* StreamingModule.m in Sources */ = {isa = PBXBuildFile; fileRef = 38D762591EDD3F58007B8DE3 /* StreamingModule.m */; }; 429D457A1CFC96E100CBD51A /* AudioRecorderManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 429D45761CFC96E100CBD51A /* AudioRecorderManager.m */; }; + 7664CAD31F39482200FC59DE /* WITCvad.m in Sources */ = {isa = PBXBuildFile; fileRef = 7664CAD21F39482200FC59DE /* WITCvad.m */; }; + 7664CAD61F394C8100FC59DE /* WITVad.m in Sources */ = {isa = PBXBuildFile; fileRef = 
diff --git a/ios/RNAudio.xcodeproj/project.pbxproj b/ios/RNAudio.xcodeproj/project.pbxproj
index 5e1326f0..d5178132 100644
--- a/ios/RNAudio.xcodeproj/project.pbxproj
+++ b/ios/RNAudio.xcodeproj/project.pbxproj
@@ -7,7 +7,11 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
+		38D7625B1EDD3F58007B8DE3 /* StreamingModule.m in Sources */ = {isa = PBXBuildFile; fileRef = 38D762591EDD3F58007B8DE3 /* StreamingModule.m */; };
 		429D457A1CFC96E100CBD51A /* AudioRecorderManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 429D45761CFC96E100CBD51A /* AudioRecorderManager.m */; };
+		7664CAD31F39482200FC59DE /* WITCvad.m in Sources */ = {isa = PBXBuildFile; fileRef = 7664CAD21F39482200FC59DE /* WITCvad.m */; };
+		7664CAD61F394C8100FC59DE /* WITVad.m in Sources */ = {isa = PBXBuildFile; fileRef = 7664CAD51F394C8100FC59DE /* WITVad.m */; };
+		76A04C0C1EDD91B800516515 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 76A04C0B1EDD91B800516515 /* AVFoundation.framework */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXCopyFilesBuildPhase section */
@@ -23,9 +27,16 @@
 /* End PBXCopyFilesBuildPhase section */
 
 /* Begin PBXFileReference section */
+		38D762591EDD3F58007B8DE3 /* StreamingModule.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = StreamingModule.m; sourceTree = SOURCE_ROOT; };
+		38D7625A1EDD3F58007B8DE3 /* StreamingModule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = StreamingModule.h; sourceTree = SOURCE_ROOT; };
 		429D45761CFC96E100CBD51A /* AudioRecorderManager.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = AudioRecorderManager.m; sourceTree = SOURCE_ROOT; };
 		429D45771CFC96E100CBD51A /* AudioRecorderManager.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AudioRecorderManager.h; sourceTree = SOURCE_ROOT; };
 		42F559BA1CFC90C400DC3F84 /* libRNAudio.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNAudio.a; sourceTree = BUILT_PRODUCTS_DIR; };
+		7664CAD11F39482200FC59DE /* WITCvad.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WITCvad.h; sourceTree = SOURCE_ROOT; };
+		7664CAD21F39482200FC59DE /* WITCvad.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = WITCvad.m; sourceTree = SOURCE_ROOT; };
+		7664CAD41F394C8100FC59DE /* WITVad.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WITVad.h; sourceTree = SOURCE_ROOT; };
+		7664CAD51F394C8100FC59DE /* WITVad.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = WITVad.m; sourceTree = SOURCE_ROOT; };
+		76A04C0B1EDD91B800516515 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -33,6 +44,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				76A04C0C1EDD91B800516515 /* AVFoundation.framework in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -44,6 +56,7 @@
 			children = (
 				42F559BC1CFC90C400DC3F84 /* RNAudio */,
 				42F559BB1CFC90C400DC3F84 /* Products */,
+				76A04C0A1EDD91B700516515 /* Frameworks */,
 			);
 			sourceTree = "<group>";
 		};
@@ -58,12 +71,26 @@
 		42F559BC1CFC90C400DC3F84 /* RNAudio */ = {
 			isa = PBXGroup;
 			children = (
+				7664CAD41F394C8100FC59DE /* WITVad.h */,
+				7664CAD51F394C8100FC59DE /* WITVad.m */,
+				7664CAD11F39482200FC59DE /* WITCvad.h */,
+				7664CAD21F39482200FC59DE /* WITCvad.m */,
 				429D45761CFC96E100CBD51A /* AudioRecorderManager.m */,
 				429D45771CFC96E100CBD51A /* AudioRecorderManager.h */,
+				38D762591EDD3F58007B8DE3 /* StreamingModule.m */,
+				38D7625A1EDD3F58007B8DE3 /* StreamingModule.h */,
 			);
 			path = RNAudio;
 			sourceTree = "<group>";
 		};
+		76A04C0A1EDD91B700516515 /* Frameworks */ = {
+			isa = PBXGroup;
+			children = (
+				76A04C0B1EDD91B800516515 /* AVFoundation.framework */,
+			);
+			name = Frameworks;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */
 
 /* Begin PBXNativeTarget section */
@@ -120,6 +147,9 @@
 			isa = PBXSourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				38D7625B1EDD3F58007B8DE3 /* StreamingModule.m in Sources */,
+				7664CAD31F39482200FC59DE /* WITCvad.m in Sources */,
+				7664CAD61F394C8100FC59DE /* WITVad.m in Sources */,
 				429D457A1CFC96E100CBD51A /* AudioRecorderManager.m in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
diff --git a/ios/StreamingModule.h b/ios/StreamingModule.h
new file mode 100644
index 00000000..c3e38948
--- /dev/null
+++ b/ios/StreamingModule.h
@@ -0,0 +1,31 @@
+//
+//  StreamingModule.h
+//  RNAudio
+//
+//  Created by JeungminOh on 30/05/2017.
+//  Copyright © 2017 Joshua Sierles. All rights reserved.
+//
+
+#import <AVFoundation/AVFoundation.h>
+
+@interface StreamingModule : NSObject
+{
+    AVAudioEngine *_engine;
+    void (^_audioDataReceived)(AVAudioPCMBuffer *buf);
+    NSURL *_fileUrl;
+    NSDictionary *_settings;
+    AVAudioMixerNode *_downMixer;
+    NSTimeInterval _startTime;
+    int _bufferSize;
+
+    @public
+    bool recording;
+    NSTimeInterval currentTime;
+}
+
+- (void)prepare:(NSURL*)recordingFileUrl bufferSize:(int)bufferSize settings:(NSDictionary*)settings handler:(void(^)(AVAudioPCMBuffer *))handler;
+- (void)start;
+- (void)pause;
+- (void)stop;
+
+@end
diff --git a/ios/StreamingModule.m b/ios/StreamingModule.m
new file mode 100644
index 00000000..9a524175
--- /dev/null
+++ b/ios/StreamingModule.m
@@ -0,0 +1,125 @@
+//
+//  StreamingModule.m
+//  RNAudio
+//
+//  Created by JeungminOh on 30/05/2017.
+//  Copyright © 2017 Joshua Sierles. All rights reserved.
+//
+
+#import "StreamingModule.h"
+
+@implementation StreamingModule
+
+- (void)prepare:(NSURL *)recordingFileUrl bufferSize:(int)bufferSize settings:(NSDictionary*)settings handler:(void(^)(AVAudioPCMBuffer *))handler {
+    _audioDataReceived = [handler copy];
+    _fileUrl = recordingFileUrl;
+    _settings = settings;
+    _bufferSize = bufferSize;
+
+    _engine = [[AVAudioEngine alloc] init];
+
+    AVAudioInputNode *input = [_engine inputNode];
+    _downMixer = [[AVAudioMixerNode alloc] init];
+    AVAudioMixerNode *mainMixer = [_engine mainMixerNode];
+
+    NSLog(@"Prepare");
+    NSLog(@"%@", [settings description]);
+
+
+    AVAudioFormat *pcmFloat32Format =
+        [[AVAudioFormat alloc] initWithCommonFormat: AVAudioPCMFormatFloat32
+                                         sampleRate: [_settings[AVSampleRateKey] doubleValue]
+                                           channels: [_settings[AVNumberOfChannelsKey] intValue]
+                                        interleaved: NO
+        ];
+
+    AVAudioFormat *pcmInt16Format =
+        [[AVAudioFormat alloc] initWithCommonFormat: AVAudioPCMFormatInt16
+                                         sampleRate: [_settings[AVSampleRateKey] doubleValue]
+                                           channels: [_settings[AVNumberOfChannelsKey] intValue]
+                                        interleaved: NO
+        ];
+
+    NSLog(@"%@", [pcmFloat32Format description]);
+
+    [_engine attachNode:_downMixer];
+    [_engine connect:input to:_downMixer format:[input inputFormatForBus:0]];
+    [_downMixer setVolume:0];
+    [_engine connect:_downMixer to:mainMixer format:pcmFloat32Format];
+
+    NSError *error = nil;
+    AVAudioFile *file = [[AVAudioFile alloc] initForWriting:_fileUrl
+                                                   settings:_settings
+                                               commonFormat:AVAudioPCMFormatInt16
+                                                interleaved:NO
+                                                      error:&error];
+
+    NSLog(@"InstallTapOnBus");
+
+    [_downMixer installTapOnBus: 0 bufferSize: _bufferSize format: pcmFloat32Format block: ^(AVAudioPCMBuffer *buf, AVAudioTime *when) {
+        // 'buf' contains audio captured from input node at time 'when'
+        currentTime = when.sampleTime / when.sampleRate - _startTime;
+
+        // convert AVAudioPCMFormatFloat32 to AVAudioPCMFormatInt16
+        AVAudioPCMBuffer *pcmInt16Buffer = [[AVAudioPCMBuffer alloc] initWithPCMFormat:pcmInt16Format
+                                                                         frameCapacity:[buf frameCapacity]];
+
+        [pcmInt16Buffer setFrameLength: [buf frameLength]];
+
+        for (int channel = 0; channel < pcmInt16Format.channelCount; channel++) {
+            for (int i = 0; i < buf.frameLength; i++) {
+                pcmInt16Buffer.int16ChannelData[channel][i] = buf.floatChannelData[channel][i] * 32767;
+            }
+        }
+
+        _audioDataReceived(pcmInt16Buffer);
+
+        NSError *writeError = nil;
+        [file writeFromBuffer:pcmInt16Buffer error:&writeError];
+        if (writeError) {
+            NSLog(@"error writing buffer: %@", [writeError localizedDescription]);
+        }
+    }];
+}
+
+- (void)start {
+    NSError *error = nil;
+    if (![_engine startAndReturnError:&error]) {
+        NSLog(@"error starting engine: %@", [error localizedDescription]);
+        return;
+    }
+    recording = true;
+}
+
+- (void)pause {
+    [_engine pause];
+    recording = false;
+}
+
+- (void)stop {
+    [_engine stop];
+    recording = false;
+}
+
+@end
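For intuition on the tap block: the engine delivers Float32 samples in [-1, 1], and the loop scales by 32767 to produce Int16 PCM for the file writer and the VAD. A standalone version of that mapping, with an explicit clamp added as a safety assumption (the in-tree loop relies on samples staying within range):

    /* Map a float sample in [-1.0, 1.0] to a 16-bit PCM value. */
    static inline short float32_to_int16(float s)
    {
        if (s > 1.0f) s = 1.0f;    /* clamp to avoid overflow */
        if (s < -1.0f) s = -1.0f;
        return (short)(s * 32767.0f);
    }
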
diff --git a/ios/WITCvad.h b/ios/WITCvad.h
new file mode 100644
--- /dev/null
+++ b/ios/WITCvad.h
+#ifndef WITCVAD_H
+#define WITCVAD_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+
+/*
+ * This speech algorithm looks at multiple auditory components related to speech:
+ * - Energy divided into 1 KHz bands
+ * - Dominant Frequency Component
+ * - Spectral Flatness Measure
+ * - Zero-crossings
+ *
+ * If many features of speech are present for a period of time (~150 ms), speech is detected.
+ * The end of speech is determined by most features of speech disappearing for an extended period of time (~1 sec)
+ */
+
+#define DETECTOR_CVAD_FRAMES_INIT 40 /* number of frames to use to initialize values */
+#define DETECTOR_CVAD_E_TH_COEFF_LOW_BAND 2.5f /* Energy threshold coefficient */
+#define DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS 2.0f /* Energy threshold coefficient */
+#define DETECTOR_CVAD_SFM_TH 3.0f /* Spectral Flatness Measure threshold */
+#define DETECTOR_CVAD_DFC_TH 250.0f /* most Dominant Frequency Component threshold */
+#define DETECTOR_CVAD_MIN_ZERO_CROSSINGS 5 /* fewest zero crossings for speech */
+#define DETECTOR_CVAD_MAX_ZERO_CROSSINGS 15 /* maximum zero crossings for speech */
+#define DETECTOR_CVAD_RESULT_MEMORY 130 /* number of frame results to keep in memory */
+#define DETECTOR_CVAD_ENERGY_MEMORY 20 /* number of frame results to keep in memory */
+#define DETECTOR_CVAD_N_ENERGY_BANDS 5 /* number of 1 KHz energy bands to compute */
+#define DETECTOR_CVAD_MINIMUM_LENGTH 1000 /* minimum length of a speech segment in ms */
+
+//final speech detection variables
+#define DETECTOR_CVAD_N_FRAMES_CHECK_START 15
+#define DETECTOR_CVAD_COUNT_SUM_START 4.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE 3.8*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT 1.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR 0.6
+#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE 0.3
+#define DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG 6.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR 1.8
+#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE 1.5
+
+typedef struct {
+    double energy_thresh_coeff_lower;
+    double energy_thresh_coeff_upper;
+    double sfm_thresh;
+    double dfc_thresh;
+    double th_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+    double th_sfm;
+    double th_dfc;
+    double ref_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+    double ref_sfm;
+    double ref_dfc;
+    double ref_dfc_var;
+    double energy_update_coeff[DETECTOR_CVAD_N_ENERGY_BANDS];
+    double energy_prev_variance[DETECTOR_CVAD_N_ENERGY_BANDS];
+    double energy_history[DETECTOR_CVAD_N_ENERGY_BANDS][DETECTOR_CVAD_ENERGY_MEMORY];
+    double sfm_update_coeff;
+    double dfc_history[DETECTOR_CVAD_FRAMES_INIT];
+    double dfc_update_coeff;
+    float end_sum_long_coeff;
+    float end_sum_short_coeff;
+    int frame_number;
+    int speech_start_frame;
+    int max_speech_time;
+    int energy_history_index;
+    int min_zero_crossings;
+    int max_zero_crossings;
+    int thresh_initialized;
+    int silence_count;
+    int talking;
+    int sample_freq;
+    int samples_per_frame;
+    int max_start_sum;
+    int n_frames_check_start;
+    int n_frames_check_end_short;
+    int n_frames_check_end_long;
+    int start_sum_threshold;
+    int previous_state_index;
+    short int previous_state[DETECTOR_CVAD_RESULT_MEMORY];
+} s_wv_detector_cvad_state;
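To make the "~150 ms" figure in the header comment concrete: samples_per_frame is chosen in wv_detector_cvad_init as the power of two nearest sample_freq/150, so the 15-frame start window spans on the order of 100-150 ms. A quick check, assuming a 16 kHz input (the rate is an assumption for illustration):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        int sample_freq = 16000;
        int spf = (int)pow(2, ceil(log2(sample_freq / 150.0)));  /* 128 samples */
        double frame_ms = 1000.0 * spf / sample_freq;            /* 8 ms per frame */
        printf("start window ~ %.0f ms\n", 15 * frame_ms);       /* ~120 ms */
        return 0;
    }
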
+
+/*
+ Main entry point to the detection algorithm.
+ This returns -1 if there is no change in state, 1 if someone started talking, and 0 if speech ended
+ */
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags);
+
+
+/*
+ Initialize the cvad_state structure, which represents the state of
+ one instance of the algorithm
+
+ sensitivity: 0 for a strong, close-up mic signal, up to 100 for a fixed, distant mic
+ */
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout);
+
+/*
+ Safely frees memory for a cvad_state
+ */
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Set VAD sensitivity (0-100)
+ - Lower values are for strong voice signals like for a cellphone or personal mic
+ - Higher values are for use with a fixed-position mic or any application with voice buried in ambient noise
+ - Defaults to 0
+ */
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity);
+
+/*
+ Set the reference values of the energy, most dominant frequency component and the spectral flatness measure.
+ The threshold values are then set based on the "background" reference levels
+ */
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm);
+
+/*
+ Set the thresholds on the cvad_state.
+ */
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Computes the variance of the energy over the past few windows and adapts the update coeffs accordingly
+ */
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Compare each component against its threshold and return a score for how many
+ components responded positively.
+ Each frame whose score is above 2 is qualified as a speech frame.
+ example : band_energy[b] > cvad_state->th_energy[b]
+ */
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings);
+
+/*
+ Return the frequency with the biggest amplitude (from a frame).
+ */
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples);
+
+/*
+ Computes the energy of the first DETECTOR_CVAD_N_ENERGY_BANDS 1 KHz bands
+ */
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples);
+
+/*
+ Compute the spectral flatness of a frame.
+ It tells us whether all the frequencies have a similar amplitude, which would mean noise,
+ or whether there are some dominant frequencies, which could mean voice.
+ */
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb);
+
+/*
+ Counts the number of times the signal crosses zero.
+ Even soft vocalizations have a fairly regular number of zero crossings (~5-15 per 10 ms)
+ */
+int frames_detector_cvad_zero_crossings(short int *samples, int nb);
+
+#endif
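The spectral flatness measure documented above is easy to build intuition for with toy spectra: a flat (noise-like) spectrum scores near 0, while a peaky (voice-like) one scores well above the 3.0 threshold. A self-contained illustration mirroring frames_detector_cvad_spectral_flatness:

    #include <math.h>
    #include <stdio.h>

    static double sfm(const float *mags, int n)
    {
        double geo = 0, arith = 0;
        for (int i = 0; i < n; i++) {
            if (mags[i] != 0.0f) { geo += log(mags[i]); arith += mags[i]; }
        }
        geo = exp(geo / n);
        arith /= n;
        return fabs(10 * log10(geo / arith));
    }

    int main(void)
    {
        float flat[4] = {1, 1, 1, 1};
        float peaky[4] = {8, 0.1f, 0.1f, 0.1f};
        printf("flat=%.2f peaky=%.2f\n", sfm(flat, 4), sfm(peaky, 4)); /* 0.00 vs ~8.4 */
        return 0;
    }
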
diff --git a/ios/WITCvad.m b/ios/WITCvad.m
new file mode 100644
index 00000000..91eabf30
--- /dev/null
+++ b/ios/WITCvad.m
@@ -0,0 +1,357 @@
+//
+//  WITCvad.m
+//  Wit
+//
+//  Created by Anthony Kesich on 11/12/14.
+//  Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#include "WITCvad.h"
+
+
+/*
+ Adds value to the head of memory
+ */
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value);
+
+/*
+ Sums up the last N values of memory
+ */
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb);
+
+
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags)
+{
+    double dfc;
+    double band_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+    double sfm;
+    int fft_size = pow(2, floor(log2(cvad_state->samples_per_frame))) / 2; //a real FFT of one frame yields samples_per_frame/2 magnitude bins
+    short int counter;
+    int action = -1;
+    int zero_crossings;
+
+    //only process cvad_state->samples_per_frame samples at a time
+    //frames_detector_cvad_fft(samples, fft_modules, cvad_state->samples_per_frame);
+    dfc = frames_detector_cvad_most_dominant_freq(cvad_state, fft_mags, fft_size, cvad_state->samples_per_frame);
+    sfm = frames_detector_cvad_spectral_flatness(fft_mags, fft_size);
+    zero_crossings = frames_detector_cvad_zero_crossings(samples, cvad_state->samples_per_frame);
+    frames_detector_cvad_multiband_energy(cvad_state, fft_mags, fft_size, band_energy, cvad_state->samples_per_frame);
+
+    vw_detector_cvad_set_threshold(cvad_state);
+    counter = vw_detector_cvad_check_frame(cvad_state, band_energy, dfc, sfm, zero_crossings);
+    frame_memory_push(cvad_state, counter);
+
+    if ((counter < 3 && cvad_state->talking == 0) || !cvad_state->thresh_initialized) {
+        cvad_state->silence_count++;
+        //only update reference levels if we don't detect speech
+        wv_detector_cvad_update_ref_levels(cvad_state, band_energy, dfc, sfm);
+    }
+    if (cvad_state->thresh_initialized) {
+        int start_sum = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_START);
+        int stop_sum_long = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG);
+        int stop_sum_short = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT);
+        int speech_time = (cvad_state->frame_number - cvad_state->speech_start_frame) * cvad_state->samples_per_frame * 1000 / cvad_state->sample_freq;
+
+        if (start_sum > cvad_state->max_start_sum) {
+            cvad_state->max_start_sum = start_sum;
+        }
+        if (!cvad_state->talking && start_sum >= cvad_state->start_sum_threshold) {
+            cvad_state->talking = 1;
+            cvad_state->speech_start_frame = cvad_state->frame_number;
+            action = 1;
+        }
+        else if (cvad_state->talking && speech_time > DETECTOR_CVAD_MINIMUM_LENGTH
+                 && ((counter < 3
+                      && stop_sum_long <= cvad_state->max_start_sum * cvad_state->end_sum_long_coeff
+                      && stop_sum_short <= cvad_state->max_start_sum * cvad_state->end_sum_short_coeff)
+                     || (cvad_state->max_speech_time > 0
+                         && speech_time >= cvad_state->max_speech_time))) {
+            cvad_state->talking = 0;
+            action = 0;
+            cvad_state->max_start_sum = 0;
+        }
+    }
+
+    cvad_state->frame_number++;
+
+    return action;
+}
+
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout)
+{
+    s_wv_detector_cvad_state *cvad_state = malloc(sizeof(s_wv_detector_cvad_state));
+    int b;
+
+    cvad_state->energy_thresh_coeff_lower = DETECTOR_CVAD_E_TH_COEFF_LOW_BAND;
+    cvad_state->energy_thresh_coeff_upper = DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS;
+    cvad_state->sfm_thresh = DETECTOR_CVAD_SFM_TH;
+    cvad_state->dfc_thresh = DETECTOR_CVAD_DFC_TH;
+    cvad_state->min_zero_crossings = DETECTOR_CVAD_MIN_ZERO_CROSSINGS;
+    cvad_state->max_zero_crossings = DETECTOR_CVAD_MAX_ZERO_CROSSINGS;
+    //memset writes single bytes, so it cannot fill a double array with 0.20
+    //or -1; initialize these per element instead
+    for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+        cvad_state->energy_update_coeff[b] = 0.20;
+        cvad_state->energy_prev_variance[b] = -1;
+    }
+    memset(cvad_state->energy_history, 0, DETECTOR_CVAD_ENERGY_MEMORY * DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+    cvad_state->energy_history_index = 0;
+    cvad_state->dfc_update_coeff = 0.10;
+    cvad_state->sfm_update_coeff = 0.10;
+    cvad_state->frame_number = 0;
+    cvad_state->speech_start_frame = -1;
+    cvad_state->max_speech_time = speech_timeout;
+    cvad_state->thresh_initialized = 0;
+    cvad_state->silence_count = 0;
+    cvad_state->talking = 0;
+    memset(cvad_state->ref_energy, 0, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+    cvad_state->ref_dfc = 0;
+    cvad_state->ref_sfm = 0;
+    memset(cvad_state->dfc_history, 0, DETECTOR_CVAD_FRAMES_INIT * sizeof(double));
+    cvad_state->sample_freq = sample_rate;
+    cvad_state->max_start_sum = 0;
+    cvad_state->samples_per_frame = pow(2, ceil(log2(cvad_state->sample_freq / 150))); //around 100 frames per second, but must be a power of two
+    cvad_state->previous_state_index = 0;
+    memset(cvad_state->previous_state, 0, DETECTOR_CVAD_RESULT_MEMORY * sizeof(short int));
+
+    wv_detector_cvad_set_sensitivity(cvad_state, sensitivity);
+
+    return cvad_state;
+}
+
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state)
+{
+    free(cvad_state);
+}
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity)
+{
+    float sensitivity_frac = fmax(0, fmin(100, sensitivity)) / 100.0;
+    cvad_state->n_frames_check_start = DETECTOR_CVAD_N_FRAMES_CHECK_START;
+    cvad_state->n_frames_check_end_short = DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT;
+    cvad_state->n_frames_check_end_long = DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG;
+
+    cvad_state->start_sum_threshold = DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE * sensitivity_frac;
+    cvad_state->start_sum_threshold += DETECTOR_CVAD_COUNT_SUM_START * (1 - sensitivity_frac);
+
+    cvad_state->end_sum_short_coeff = DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE * sensitivity_frac;
+    cvad_state->end_sum_short_coeff += DETECTOR_CVAD_COUNT_END_SHORT_FACTOR * (1 - sensitivity_frac);
+
+    cvad_state->end_sum_long_coeff = DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE * sensitivity_frac;
+    cvad_state->end_sum_long_coeff += DETECTOR_CVAD_COUNT_END_LONG_FACTOR * (1 - sensitivity_frac);
+}
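Worth noting for review: set_sensitivity blends linearly between the close-mic and far-mic tuning constants. Worked through for the start threshold (constants taken from the #defines in WITCvad.h; the printed values are plain arithmetic, not measurements):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        for (int s = 0; s <= 100; s += 50) {
            float frac = fmax(0, fmin(100, s)) / 100.0f;
            float th = 3.8f * 15 * frac + 4.5f * 15 * (1 - frac);
            printf("sensitivity %3d -> start threshold %.2f\n", s, th);
        }
        return 0; /* 0 -> 67.50, 50 -> 62.25, 100 -> 57.00 */
    }
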
+
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state,
+                                        double *band_energy,
+                                        double dfc,
+                                        double sfm)
+{
+    int b = 0;
+    if (!cvad_state->thresh_initialized) {
+        //if still initializing, accumulate values to average
+        for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+            cvad_state->ref_energy[b] += band_energy[b];
+        }
+
+        cvad_state->ref_sfm += sfm;
+
+        //dfc_history only holds DETECTOR_CVAD_FRAMES_INIT entries
+        if (cvad_state->frame_number < DETECTOR_CVAD_FRAMES_INIT) {
+            cvad_state->dfc_history[cvad_state->frame_number] = dfc > 0 ? log(dfc) : 0;
+        }
+    }
+
+    //record energy history
+    for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+        cvad_state->energy_history[b][cvad_state->energy_history_index] = band_energy[b];
+    }
+    cvad_state->energy_history_index++;
+    cvad_state->energy_history_index %= DETECTOR_CVAD_ENERGY_MEMORY;
+
+    if (cvad_state->frame_number >= DETECTOR_CVAD_FRAMES_INIT) {
+        if (!cvad_state->thresh_initialized) {
+            //if done initializing, divide by number of samples to get an average
+            cvad_state->thresh_initialized = 1;
+            for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+                cvad_state->ref_energy[b] /= cvad_state->frame_number;
+            }
+
+            cvad_state->ref_sfm /= cvad_state->frame_number;
+
+            double sum = 0;
+            double sq_sum = 0;
+            for (b = 0; b < DETECTOR_CVAD_FRAMES_INIT; b++) {
+                cvad_state->ref_dfc += cvad_state->dfc_history[b];
+                sum += cvad_state->dfc_history[b];
+                sq_sum += pow(cvad_state->dfc_history[b], 2);
+            }
+            cvad_state->ref_dfc /= cvad_state->frame_number;
+            cvad_state->ref_dfc_var = (sq_sum - sum*sum/cvad_state->frame_number) / (cvad_state->frame_number - 1);
+
+        } else if (cvad_state->talking == 0) {
+            //otherwise update thresholds based on adaptive rules if there's no speech
+            wv_detector_cvad_modify_update_coeffs(cvad_state);
+            for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+                cvad_state->ref_energy[b] *= (1 - cvad_state->energy_update_coeff[b]);
+                cvad_state->ref_energy[b] += cvad_state->energy_update_coeff[b] * band_energy[b];
+            }
+
+        }
+    }
+
+}
+
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state)
+{
+    //update thresholds to be a multiple of the reference level
+    int b;
+    cvad_state->th_energy[0] = cvad_state->ref_energy[0] * cvad_state->energy_thresh_coeff_lower;
+    for (b = 1; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+        cvad_state->th_energy[b] = cvad_state->ref_energy[b] * cvad_state->energy_thresh_coeff_upper;
+    }
+    cvad_state->th_dfc = cvad_state->ref_dfc + cvad_state->dfc_thresh;
+    cvad_state->th_sfm = cvad_state->ref_sfm + cvad_state->sfm_thresh;
+}
+
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state)
+{
+    int b;
+    for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+        double sum = 0;
+        double sq_sum = 0;
+        int h;
+        for (h = 0; h < DETECTOR_CVAD_ENERGY_MEMORY; h++) {
+            sum += cvad_state->energy_history[b][h];
+            sq_sum += pow(cvad_state->energy_history[b][h], 2);
+        }
+        double variance = (sq_sum - sum*sum/DETECTOR_CVAD_ENERGY_MEMORY) / (DETECTOR_CVAD_ENERGY_MEMORY - 1);
+        double ratio = variance / cvad_state->energy_prev_variance[b];
+        if (ratio > 1.25) {
+            cvad_state->energy_update_coeff[b] = 0.25;
+        } else if (ratio > 1.10) {
+            cvad_state->energy_update_coeff[b] = 0.20;
+        } else if (ratio > 1.00) {
+            cvad_state->energy_update_coeff[b] = 0.15;
+        } else if (ratio > 0.00) {
+            cvad_state->energy_update_coeff[b] = 0.10;
+        } else {
+            //negative value indicates that this is the first pass of variance. Just set the coeff to 0.2
+            cvad_state->energy_update_coeff[b] = 0.20;
+        }
+        cvad_state->energy_prev_variance[b] = variance;
+    }
+}
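The adaptive reference update in wv_detector_cvad_update_ref_levels is a plain exponential moving average, with the per-band coefficient chosen by modify_update_coeffs above. Reduced to one line for clarity:

    /* Exponential moving average: a near 0 adapts slowly, a near 1 quickly. */
    double ema(double ref, double x, double a)
    {
        return (1 - a) * ref + a * x;
    }
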
+
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings)
+{
+    short int counter;
+
+    counter = 0;
+
+    int band_counter = 0;
+    if (band_energy[0] > cvad_state->th_energy[0]) {
+        counter += 2;
+    }
+
+    int b;
+    for (b = 1; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+        if (band_energy[b] > cvad_state->th_energy[b]) {
+            band_counter++;
+        }
+    }
+    if (band_counter >= 2) {
+        counter += 2;
+    }
+
+    if (fabs((dfc > 0 ? log(dfc) : 0) - cvad_state->ref_dfc) > cvad_state->ref_dfc_var) {
+        counter++;
+    }
+    if (sfm > cvad_state->th_sfm) {
+        counter++;
+    }
+    if (zero_crossings >= cvad_state->min_zero_crossings && zero_crossings <= cvad_state->max_zero_crossings) {
+        counter++;
+    }
+
+    return counter;
+}
+
+
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples)
+{
+    double k = 0.0f;
+    double max = 0.0f;
+    double amplitude_minimum = 1.0f;
+    int i;
+
+    for (i = 0; i < nb_modules; i++) {
+        if (fft_mags[i] > max && fft_mags[i] > amplitude_minimum) {
+            max = fft_mags[i];
+            k = i;
+        }
+    }
+
+    return k * (double)cvad_state->sample_freq / (double)nb_samples;
+}
+
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples)
+{
+    int b = 0;
+    int k = 0;
+
+    for (b = 0; b < DETECTOR_CVAD_N_ENERGY_BANDS; b++) {
+        band_energy[b] = 0;
+        while (k * cvad_state->sample_freq / nb_samples < 1000 * (b + 1)) {
+            band_energy[b] += fft_mags[k];
+            k++;
+        }
+    }
+
+}
+
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb)
+{
+    double geo_mean = 0.0f;
+    double arithm_mean = 0.0f;
+    double sfm = 0.0f;
+    int i;
+
+    for (i = 0; i < nb; i++) {
+        if (fft_mags[i] != 0.0f) {
+            geo_mean += log(fft_mags[i]);
+            arithm_mean += fft_mags[i];
+        }
+    }
+    geo_mean = exp(geo_mean / (double) nb);
+    arithm_mean = arithm_mean / (double) nb;
+    sfm = 10 * log10(geo_mean / arithm_mean);
+    sfm = fabs(sfm);
+
+    return sfm;
+}
+
+int frames_detector_cvad_zero_crossings(short int *samples, int nb)
+{
+    int num_zero_crossings = 0;
+    int i;
+
+    for (i = 1; i < nb; i++) {
+        if ((samples[i-1] < 0 && samples[i] >= 0) || (samples[i-1] >= 0 && samples[i] < 0)) {
+            num_zero_crossings++;
+        }
+    }
+
+    return num_zero_crossings;
+}
+
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value)
+{
+    cvad_state->previous_state[cvad_state->previous_state_index] = value;
+    cvad_state->previous_state_index++;
+    cvad_state->previous_state_index %= DETECTOR_CVAD_RESULT_MEMORY;
+}
+
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb)
+{
+    int i = 0;
+    int sum = 0;
+
+    for (i = 0; i < nb; i++) {
+        int indx = (cvad_state->previous_state_index - (i+1) + DETECTOR_CVAD_RESULT_MEMORY) % DETECTOR_CVAD_RESULT_MEMORY;
+        sum += cvad_state->previous_state[indx];
+    }
+
+    return sum;
+}
+
diff --git a/ios/WITVad.h b/ios/WITVad.h
new file mode 100644
index 00000000..c0c90610
--- /dev/null
+++ b/ios/WITVad.h
@@ -0,0 +1,32 @@
+//
+//  WITVad.h
+//  Wit
+//
+//  Created by Aric Lasry on 8/6/14.
+//  Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#import <Foundation/Foundation.h>
+#import <Accelerate/Accelerate.h>
+#import "WITCvad.h"
+
+@protocol WITVadDelegate;
+
+@interface WITVad : NSObject
+
+@property (nonatomic, weak) id<WITVadDelegate> delegate;
+
+@property (nonatomic, assign) BOOL stoppedUsingVad;
+
+- (instancetype)initWithAudioSampleRate:(int)audioSampleRate vadSensitivity:(int)_vadSensitivity vadTimeout:(int)_vadTimeout;
+- (void)gotAudioSamples:(NSData *)samples;
+
+@end
+
+
+@protocol WITVadDelegate
+
+-(void) vadStartedTalking;
+-(void) vadStoppedTalking;
+
+@end
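When reading frames_detector_cvad_most_dominant_freq above and get_fft below, it helps to remember that magnitude bin k of an N-sample frame maps to k*fs/N Hz. Assuming a 16 kHz stream and its 128-sample frames:

    #include <stdio.h>

    int main(void)
    {
        int fs = 16000, N = 128;
        for (int k = 0; k <= 24; k += 8) {
            printf("bin %2d -> %4d Hz\n", k, k * fs / N); /* 0, 1000, 2000, 3000 Hz */
        }
        return 0;
    }
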
diff --git a/ios/WITVad.m b/ios/WITVad.m
new file mode 100644
index 00000000..d587bea2
--- /dev/null
+++ b/ios/WITVad.m
@@ -0,0 +1,123 @@
+//
+//  WITVad.m
+//  Wit
+//
+//  Created by Aric Lasry on 8/6/14.
+//  Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#import "WITVad.h"
+
+@implementation WITVad {
+    s_wv_detector_cvad_state *vad_state;
+    FFTSetup fft_setup;
+}
+
+- (void)gotAudioSamples:(NSData *)samples {
+    UInt32 size = (UInt32)[samples length];
+    short *bytes = (short*)[samples bytes];
+
+    for (int sample_offset = 0; sample_offset + self->vad_state->samples_per_frame < size/2; sample_offset += self->vad_state->samples_per_frame) {
+
+        int nonZero = 0;
+
+        //check to make sure buffer actually has audio data
+        for (int i = 0; i < self->vad_state->samples_per_frame; i++) {
+            if (bytes[sample_offset+i] != 0) {
+                nonZero = 1;
+                break;
+            }
+        }
+
+        //skip frame if it has nothing
+        if (!nonZero) continue;
+
+        float *fft_mags = [self get_fft:(bytes+sample_offset)];
+
+        int detected_speech = wvs_cvad_detect_talking(self->vad_state, bytes+sample_offset, fft_mags);
+
+        free(fft_mags);
+
+        if (detected_speech == 1) {
+            //someone just started talking
+            NSLog(@"start talking...");
+            dispatch_async(dispatch_get_main_queue(), ^{
+                [self.delegate vadStartedTalking];
+            });
+        } else if (detected_speech == 0) {
+            //someone just stopped talking
+            NSLog(@"stop talking...");
+            self.stoppedUsingVad = YES;
+            dispatch_async(dispatch_get_main_queue(), ^{
+                [self.delegate vadStoppedTalking];
+            });
+            break;
+        }
+    }
+
+}
+
+- (instancetype)initWithAudioSampleRate:(int)audioSampleRate vadSensitivity:(int)_vadSensitivity vadTimeout:(int)_vadTimeout {
+    self = [super init];
+    if (!self) {
+        return nil;
+    }
+    int vadSensitivity = (int)fmin(100, fmax(0, _vadSensitivity)); //must be between 0 and 100
+    int vadTimeout = (int)_vadTimeout;
+
+    self->vad_state = wv_detector_cvad_init(audioSampleRate, vadSensitivity, vadTimeout);
+    self.stoppedUsingVad = NO;
+
+    //get the power of 2 that fits our frame
+    int logN = log2(self->vad_state->samples_per_frame); //samples_per_frame will be a power of 2
+    //store the FFT setup for many later uses
+    self->fft_setup = vDSP_create_fftsetup(logN, kFFTRadix2);
+
+    return self;
+}
+
+- (void)dealloc {
+    wv_detector_cvad_clean(self->vad_state);
+    vDSP_destroy_fftsetup(self->fft_setup); //release the cached FFT setup as well
+}
+
+- (float*)get_fft:(short *)samples {
+    int N = self->vad_state->samples_per_frame; //guaranteed to be a power of 2
+
+    //dynamically allocate an array for our results since we don't want to mutate the input samples
+    float *fft_mags = malloc(N/2 * sizeof(float));
+    float *fsamples = malloc(N * sizeof(float));
+
+    for (int i = 0; i < N; i++) {
+        if (i < self->vad_state->samples_per_frame) {
+            fsamples[i] = samples[i];
+        } else {
+            fsamples[i] = 0;
+        }
+    }
+
+    DSPSplitComplex tempSplitComplex;
+    tempSplitComplex.realp = malloc(N/2 * sizeof(float));
+    tempSplitComplex.imagp = malloc(N/2 * sizeof(float));
+
+    //pack the real data into a split form for accelerate
+    vDSP_ctoz((DSPComplex*)fsamples, 2, &tempSplitComplex, 1, N/2);
+
+    //do the FFT
+    vDSP_fft_zrip(self->fft_setup, &tempSplitComplex, 1, (int)log2(N), kFFTDirection_Forward);
+
+    //get the magnitudes
+    vDSP_zvabs(&tempSplitComplex, 1, fft_mags, 1, N/2);
+
+    //clean up memory
+    free(fsamples);
+    free(tempSplitComplex.realp);
+    free(tempSplitComplex.imagp);
+
+    return fft_mags;
+}
+
+@end
diff --git a/package.json b/package.json
index e2bbbe5a..96a9d064 100644
--- a/package.json
+++ b/package.json
@@ -5,9 +5,7 @@
   "main": "index.js",
   "author": "Joshua Sierles (https://github.com/jsierles)",
   "files": [
-    "ios/AudioRecorderManager.m",
-    "ios/AudioRecorderManager.h",
-    "ios/RNAudio.xcodeproj",
+    "ios/*",
     "README.md",
     "LICENSE",
     "index.js",