diff --git a/AudioExample/AudioExample.js b/AudioExample/AudioExample.js
index a65e04d6..a5419e87 100644
--- a/AudioExample/AudioExample.js
+++ b/AudioExample/AudioExample.js
@@ -20,18 +20,23 @@ class AudioExample extends Component {
recording: false,
stoppedRecording: false,
finished: false,
- audioPath: AudioUtils.DocumentDirectoryPath + '/test.aac',
+ audioPath: AudioUtils.DocumentDirectoryPath + '/test.wav',
hasPermission: undefined,
};
prepareRecordingPath(audioPath){
- AudioRecorder.prepareRecordingAtPath(audioPath, {
- SampleRate: 22050,
- Channels: 1,
- AudioQuality: "Low",
- AudioEncoding: "aac",
- AudioEncodingBitRate: 32000
- });
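+ // bufferSize is in 16-bit frames: 1600 frames is roughly 73 ms of audio at the 22050 Hz sample rate below.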
+ AudioRecorder.prepareStreamingAtPath(audioPath, 1600, {
+ SampleRate: 22050,
+ Channels: 1,
+ AudioSource: 'MIC',
+ // The following options are not supported by the streaming recorder:
+ // AudioQuality: "Low",
+ // AudioEncoding: "aac",
+ // AudioEncodingBitRate: 32000,
+ }, {
+ Sensitivity: 0,
+ Timeout: 7000,
+ });
}
componentDidMount() {
@@ -41,7 +46,8 @@ class AudioExample extends Component {
if (!hasPermission) return;
this.prepareRecordingPath(this.state.audioPath);
-
+ console.log(this.state.audioPath);
+ console.log(AudioRecorder);
AudioRecorder.onProgress = (data) => {
this.setState({currentTime: Math.floor(data.currentTime)});
};
@@ -52,6 +58,14 @@ class AudioExample extends Component {
this._finishRecording(data.status === "OK", data.audioFileURL);
}
};
+
+ AudioRecorder.onDataReceived = (data) => {
+ // console.log(data);
+ }
+
+ AudioRecorder.onVadReceived = (vadResult) => {
+ console.log(vadResult);
+ }
});
}
@@ -93,7 +107,7 @@ class AudioExample extends Component {
this.setState({stoppedRecording: true, recording: false});
try {
- const filePath = await AudioRecorder.pauseRecording();
+ const filePath = await AudioRecorder.pauseStreaming();
// Pause is currently equivalent to stop on Android.
if (Platform.OS === 'android') {
@@ -113,7 +127,7 @@ class AudioExample extends Component {
this.setState({stoppedRecording: true, recording: false});
try {
- const filePath = await AudioRecorder.stopRecording();
+ const filePath = await AudioRecorder.stopStreaming();
if (Platform.OS === 'android') {
this._finishRecording(true, filePath);
@@ -168,7 +182,7 @@ class AudioExample extends Component {
this.setState({recording: true});
try {
- const filePath = await AudioRecorder.startRecording();
+ const filePath = await AudioRecorder.startStreaming();
} catch (error) {
console.error(error);
}
diff --git a/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj b/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj
index 57c314fb..9ae1682b 100644
--- a/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj
+++ b/AudioExample/iOS/AudioExample.xcodeproj/project.pbxproj
@@ -528,8 +528,12 @@
TargetAttributes = {
00E356ED1AD99517003FC87E = {
CreatedOnToolsVersion = 6.2;
+ DevelopmentTeam = SD72YP83U5;
TestTargetID = 13B07F861A680F5B00A75B9A;
};
+ 13B07F861A680F5B00A75B9A = {
+ DevelopmentTeam = SD72YP83U5;
+ };
};
};
buildConfigurationList = 83CBB9FA1A601CBA00E9B192 /* Build configuration list for PBXProject "AudioExample" */;
@@ -878,6 +882,7 @@
isa = XCBuildConfiguration;
buildSettings = {
BUNDLE_LOADER = "$(TEST_HOST)";
+ DEVELOPMENT_TEAM = SD72YP83U5;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
@@ -900,6 +905,7 @@
buildSettings = {
BUNDLE_LOADER = "$(TEST_HOST)";
COPY_PHASE_STRIP = NO;
+ DEVELOPMENT_TEAM = SD72YP83U5;
INFOPLIST_FILE = AudioExampleTests/Info.plist;
IPHONEOS_DEPLOYMENT_TARGET = 8.0;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks @loader_path/Frameworks";
@@ -919,6 +925,7 @@
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CURRENT_PROJECT_VERSION = 1;
DEAD_CODE_STRIPPING = NO;
+ DEVELOPMENT_TEAM = SD72YP83U5;
INFOPLIST_FILE = AudioExample/Info.plist;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_LDFLAGS = (
@@ -936,6 +943,7 @@
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
CURRENT_PROJECT_VERSION = 1;
+ DEVELOPMENT_TEAM = SD72YP83U5;
INFOPLIST_FILE = AudioExample/Info.plist;
LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/Frameworks";
OTHER_LDFLAGS = (
diff --git a/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json b/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json
index 118c98f7..b8236c65 100644
--- a/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json
+++ b/AudioExample/iOS/AudioExample/Images.xcassets/AppIcon.appiconset/Contents.json
@@ -1,5 +1,15 @@
{
"images" : [
+ {
+ "idiom" : "iphone",
+ "size" : "20x20",
+ "scale" : "2x"
+ },
+ {
+ "idiom" : "iphone",
+ "size" : "20x20",
+ "scale" : "3x"
+ },
{
"idiom" : "iphone",
"size" : "29x29",
diff --git a/AudioExample/iOS/AudioExample/Info.plist b/AudioExample/iOS/AudioExample/Info.plist
index 2fb6a11c..4728718c 100644
--- a/AudioExample/iOS/AudioExample/Info.plist
+++ b/AudioExample/iOS/AudioExample/Info.plist
@@ -38,6 +38,8 @@
 <key>NSLocationWhenInUseUsageDescription</key>
+ <key>NSMicrophoneUsageDescription</key>
+ <string></string>
 <key>NSAppTransportSecurity</key>
diff --git a/android/build.gradle b/android/build.gradle
index 22023145..56d21662 100644
--- a/android/build.gradle
+++ b/android/build.gradle
@@ -9,6 +9,10 @@ android {
targetSdkVersion 23
versionCode 1
versionName "1.0"
+ ndk {
+ moduleName "witvad"
+ ldLibs "log"
+ }
}
buildTypes {
release {
@@ -21,4 +25,5 @@ dependencies {
compile fileTree(include: ['*.jar'], dir: 'libs')
compile 'com.android.support:appcompat-v7:23.1.0'
compile 'com.facebook.react:react-native:+'
+ compile 'com.github.wendykierp:JTransforms:3.0'
}
diff --git a/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java b/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java
index 33c65d0f..5b266a75 100644
--- a/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java
+++ b/android/src/main/java/com/rnim/rn/audio/AudioRecorderManager.java
@@ -1,7 +1,6 @@
package com.rnim.rn.audio;
import android.Manifest;
-import android.content.Context;
import com.facebook.react.bridge.ReactApplicationContext;
import com.facebook.react.bridge.ReactContextBaseJavaModule;
@@ -10,25 +9,24 @@
import com.facebook.react.bridge.Arguments;
import com.facebook.react.bridge.Promise;
import com.facebook.react.bridge.ReadableMap;
+import com.facebook.react.bridge.WritableArray;
import com.facebook.react.bridge.WritableMap;
import java.io.File;
-import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import android.content.pm.PackageManager;
+import android.media.AudioFormat;
+import android.os.AsyncTask;
import android.os.Environment;
import android.media.MediaRecorder;
-import android.media.AudioManager;
-import android.support.v4.app.ActivityCompat;
import android.support.v4.content.ContextCompat;
import android.util.Log;
-import com.facebook.react.modules.core.DeviceEventManagerModule;
-import java.io.FileInputStream;
+import com.facebook.react.modules.core.DeviceEventManagerModule;
class AudioRecorderManager extends ReactContextBaseJavaModule {
@@ -42,17 +40,19 @@ class AudioRecorderManager extends ReactContextBaseJavaModule {
private static final String MusicDirectoryPath = "MusicDirectoryPath";
private static final String DownloadsDirectoryPath = "DownloadsDirectoryPath";
- private Context context;
- private MediaRecorder recorder;
private String currentOutputFile;
private boolean isRecording = false;
private Timer timer;
private int recorderSecondsElapsed;
+ // For AudioRecord Class
+ private RecordWaveTask recordTask = null;
public AudioRecorderManager(ReactApplicationContext reactContext) {
super(reactContext);
- this.context = reactContext;
+ if (recordTask == null) {
+ recordTask = new RecordWaveTask();
+ }
}
@Override
@@ -81,57 +81,24 @@ public void checkAuthorizationStatus(Promise promise) {
promise.resolve(permissionGranted);
}
- @ReactMethod
- public void prepareRecordingAtPath(String recordingPath, ReadableMap recordingSettings, Promise promise) {
- if (isRecording){
- logAndRejectPromise(promise, "INVALID_STATE", "Please call stopRecording before starting recording");
- }
-
- recorder = new MediaRecorder();
- try {
- recorder.setAudioSource(MediaRecorder.AudioSource.MIC);
- int outputFormat = getOutputFormatFromString(recordingSettings.getString("OutputFormat"));
- recorder.setOutputFormat(outputFormat);
- int audioEncoder = getAudioEncoderFromString(recordingSettings.getString("AudioEncoding"));
- recorder.setAudioEncoder(audioEncoder);
- recorder.setAudioSamplingRate(recordingSettings.getInt("SampleRate"));
- recorder.setAudioChannels(recordingSettings.getInt("Channels"));
- recorder.setAudioEncodingBitRate(recordingSettings.getInt("AudioEncodingBitRate"));
- recorder.setOutputFile(recordingPath);
- }
- catch(final Exception e) {
- logAndRejectPromise(promise, "COULDNT_CONFIGURE_MEDIA_RECORDER" , "Make sure you've added RECORD_AUDIO permission to your AndroidManifest.xml file "+e.getMessage());
- return;
- }
-
- currentOutputFile = recordingPath;
- try {
- recorder.prepare();
- promise.resolve(currentOutputFile);
- } catch (final Exception e) {
- logAndRejectPromise(promise, "COULDNT_PREPARE_RECORDING_AT_PATH "+recordingPath, e.getMessage());
- }
-
- }
-
private int getAudioEncoderFromString(String audioEncoder) {
- switch (audioEncoder) {
- case "aac":
- return MediaRecorder.AudioEncoder.AAC;
- case "aac_eld":
- return MediaRecorder.AudioEncoder.AAC_ELD;
- case "amr_nb":
- return MediaRecorder.AudioEncoder.AMR_NB;
- case "amr_wb":
- return MediaRecorder.AudioEncoder.AMR_WB;
- case "he_aac":
- return MediaRecorder.AudioEncoder.HE_AAC;
- case "vorbis":
- return MediaRecorder.AudioEncoder.VORBIS;
- default:
- Log.d("INVALID_AUDIO_ENCODER", "USING MediaRecorder.AudioEncoder.DEFAULT instead of "+audioEncoder+": "+MediaRecorder.AudioEncoder.DEFAULT);
- return MediaRecorder.AudioEncoder.DEFAULT;
- }
+ switch (audioEncoder) {
+ case "aac":
+ return MediaRecorder.AudioEncoder.AAC;
+ case "aac_eld":
+ return MediaRecorder.AudioEncoder.AAC_ELD;
+ case "amr_nb":
+ return MediaRecorder.AudioEncoder.AMR_NB;
+ case "amr_wb":
+ return MediaRecorder.AudioEncoder.AMR_WB;
+ case "he_aac":
+ return MediaRecorder.AudioEncoder.HE_AAC;
+ case "vorbis":
+ return MediaRecorder.AudioEncoder.VORBIS;
+ default:
+ Log.d("INVALID_AUDIO_ENCODER", "USING MediaRecorder.AudioEncoder.DEFAULT instead of "+audioEncoder+": "+MediaRecorder.AudioEncoder.DEFAULT);
+ return MediaRecorder.AudioEncoder.DEFAULT;
+ }
}
private int getOutputFormatFromString(String outputFormat) {
@@ -156,52 +123,144 @@ private int getOutputFormatFromString(String outputFormat) {
}
@ReactMethod
- public void startRecording(Promise promise){
- if (recorder == null){
- logAndRejectPromise(promise, "RECORDING_NOT_PREPARED", "Please call prepareRecordingAtPath before starting recording");
- return;
+ public void prepareStreamingAtPath(String recordingPath, int bufferSize, ReadableMap recordingSettings, ReadableMap vadSettings, Promise promise) {
+
+ try {
+ File wavFile = new File(recordingPath);
+ recordTask = new RecordWaveTask();
+
+ if (recordingSettings.hasKey("AudioSource")) {
+ switch(recordingSettings.getString("AudioSource")) {
+ case "DEFAULT":
+ recordTask.setAudioSource(MediaRecorder.AudioSource.DEFAULT);
+ break;
+ case "MIC":
+ recordTask.setAudioSource(MediaRecorder.AudioSource.MIC);
+ break;
+ case "VOICE_RECOGNITION":
+ recordTask.setAudioSource(MediaRecorder.AudioSource.VOICE_RECOGNITION);
+ break;
+ default:
+ recordTask.setAudioSource(MediaRecorder.AudioSource.DEFAULT);
+ break;
+ }
+ }
+
+ if (recordingSettings.hasKey("SampleRate")) {
+ recordTask.setSampleRate(recordingSettings.getInt("SampleRate"));
+ }
+
+ if (recordingSettings.hasKey("Channels")) {
+ int channels = recordingSettings.getInt("Channels");
+ int channelMask = AudioFormat.CHANNEL_IN_STEREO;
+ if (channels == 1) {
+ channelMask = AudioFormat.CHANNEL_IN_MONO;
+ }
+ recordTask.setChannelMask(channelMask);
+ }
+
+ if (vadSettings.hasKey("Sensitivity")) {
+ int vadSensitivity = vadSettings.getInt("Sensitivity");
+ recordTask.setVadSensitivity(vadSensitivity);
+ }
+
+ if (vadSettings.hasKey("Timeout")) {
+ int vadTimeout = vadSettings.getInt("Timeout");
+ recordTask.setVadTimeout(vadTimeout);
+ }
+
+ recordTask.setBufferSize(bufferSize);
+
+ recordTask.setOutputFile(wavFile);
+ recordTask.setStreamListener(new RecordWaveTask.OnStreamListener() {
+
+ @Override
+ public void onDataReceived(short[] buffer) {
+ Log.d("onDataReceived", buffer.length + "");
+ WritableArray body = Arguments.createArray();
+ for (short value: buffer) {
+ body.pushInt((int) value);
+ }
+ sendEvent("dataReceived", body);
+ }
+ });
+
+ recordTask.setVadListener(new RecordWaveTask.OnVadListener() {
+
+ @Override
+ public void onVadReceived(int vadResult) {
+ Log.d("onVadReceived", vadResult + "");
+ // WritableMap body = Arguments.createMap();
+ // body.putInt("vadResult", vadResult);
+ sendEvent("vadReceived", vadResult);
+ }
+ });
+
+ // int outputFormat = getOutputFormatFromString(recordingSettings.getString("OutputFormat"));
+ // recorder.setOutputFormat(outputFormat);
+ // int audioEncoder = getAudioEncoderFromString(recordingSettings.getString("AudioEncoding"));
+ // recorder.setAudioEncoder(audioEncoder);
+ // recorder.setAudioEncodingBitRate(recordingSettings.getInt("AudioEncodingBitRate"));
}
- if (isRecording){
- logAndRejectPromise(promise, "INVALID_STATE", "Please call stopRecording before starting recording");
+ catch(final Exception e) {
+ logAndRejectPromise(promise, "COULDNT_CONFIGURE_MEDIA_RECORDER" , "Make sure you've added RECORD_AUDIO permission to your AndroidManifest.xml file "+e.getMessage());
return;
}
- recorder.start();
- isRecording = true;
- startTimer();
- promise.resolve(currentOutputFile);
+
+ currentOutputFile = recordingPath;
+ promise.resolve(currentOutputFile);
}
@ReactMethod
- public void stopRecording(Promise promise){
- if (!isRecording){
- logAndRejectPromise(promise, "INVALID_STATE", "Please call startRecording before stopping recording");
+ public void startStreaming(Promise promise){
+ if (recordTask == null){
+ logAndRejectPromise(promise, "STREAMING_NOT_PREPARED", "Please call prepareStreamingAtPath before starting streaming");
return;
}
-
- stopTimer();
- isRecording = false;
-
- try {
- recorder.stop();
- recorder.release();
- }
- catch (final RuntimeException e) {
- // https://developer.android.com/reference/android/media/MediaRecorder.html#stop()
- logAndRejectPromise(promise, "RUNTIME_EXCEPTION", "No valid audio data received. You may be using a device that can't record audio.");
- return;
- }
- finally {
- recorder = null;
+ switch (recordTask.getStatus()) {
+ case RUNNING:
+ logAndRejectPromise(promise, "INVALID_STATE", "Please call stopStreaming before starting streaming");
+ return;
+ case FINISHED:
+ logAndRejectPromise(promise, "STREAMING_NOT_PREPARED", "Please call prepareStreamingAtPath before starting streaming");
+ return; // a finished AsyncTask cannot be executed again
+ case PENDING:
+ // No Action
}
+ startTimer();
+ recordTask.execute();
+
+ isRecording = true;
promise.resolve(currentOutputFile);
- sendEvent("recordingFinished", null);
}
@ReactMethod
- public void pauseRecording(Promise promise){
+ public void stopStreaming(final Promise promise){
+ Log.d("RecordWaveTask", "stopStreaming");
+ if (recordTask != null && !recordTask.isCancelled() && recordTask.getStatus() == AsyncTask.Status.RUNNING) {
+ Log.d("RecordWaveTask", "stopStreaming2");
+ isRecording = false;
+ recordTask.setCancelCompleteListener(new RecordWaveTask.OnCancelCompleteListener() {
+ @Override
+ public void onCancelCompleted() {
+ Log.d("RecordWaveTask", "onCancelCompleted");
+ recordTask = null;
+ promise.resolve(currentOutputFile);
+ sendEvent("recordingFinished", null);
+ }
+ });
+ recordTask.cancel(false);
+ stopTimer();
+ } else {
+ Log.d("RecordWaveTask", "Task not running.");
+ logAndRejectPromise(promise, "INVALID_STATE", "Please call startStreaming before stopping streaming");
+ }
+ }
+
+ @ReactMethod
+ public void pauseStreaming(Promise promise){
// Added this function to have the same api for android and iOS, stops recording now
- stopRecording(promise);
+ stopStreaming(promise);
}
private void startTimer(){
@@ -226,7 +285,7 @@ private void stopTimer(){
timer = null;
}
}
-
+
private void sendEvent(String eventName, Object params) {
getReactApplicationContext()
.getJSModule(DeviceEventManagerModule.RCTDeviceEventEmitter.class)
diff --git a/android/src/main/java/com/rnim/rn/audio/RecordWaveTask.java b/android/src/main/java/com/rnim/rn/audio/RecordWaveTask.java
new file mode 100644
index 00000000..c961481e
--- /dev/null
+++ b/android/src/main/java/com/rnim/rn/audio/RecordWaveTask.java
@@ -0,0 +1,384 @@
+package com.rnim.rn.audio;
+
+import android.media.AudioFormat;
+import android.media.AudioRecord;
+import android.media.MediaRecorder;
+import android.os.AsyncTask;
+import android.os.SystemClock;
+import android.util.Log;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import java.util.Arrays;
+import org.jtransforms.fft.FloatFFT_1D;
+
+/**
+ * Created by KDH on 2017. 5. 15..
+ */
+
+public class RecordWaveTask extends AsyncTask<File, Void, Object[]> {
+
+ // Default value
+ private int AUDIO_SOURCE = MediaRecorder.AudioSource.DEFAULT;
+ private int SAMPLE_RATE = 44100; // Hz
+ private int ENCODING = AudioFormat.ENCODING_PCM_16BIT;
+ private int CHANNEL_MASK = AudioFormat.CHANNEL_IN_MONO;
+ private int BUFFER_SIZE_IN_FRAME = 8192;
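+ // One "frame" is one 16-bit sample, so the byte buffer handed to AudioRecord below is twice this value.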
+ private int vadSensitivity = 0;
+ private int vadTimeout = 7000;
+ // int BUFFER_SIZE = 2 * AudioRecord.getMinBufferSize(SAMPLE_RATE, CHANNEL_MASK, ENCODING);
+
+ private File outputFile;
+
+ static {
+ System.loadLibrary("witvad");
+ }
+
+ public native int VadInit(int sampleRate, int vadSensitivity, int vadTimeout);
+ public native int VadStillTalking(short[] samples, float[] fft_mags);
+ public native int GetVadSamplesPerFrame();
+ public native void VadClean();
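+ // Implemented in android/src/main/jni (WITVadWrapper.c) and loaded via the "witvad" NDK module declared in android/build.gradle.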
+
+ public RecordWaveTask() {}
+ public void setAudioSource(int audioSource) { this.AUDIO_SOURCE = audioSource; }
+
+ public void setSampleRate(int sampleRate) { this.SAMPLE_RATE = sampleRate; }
+
+ public void setEncoding(int encoding) { this.ENCODING = encoding; }
+
+ public void setChannelMask(int channelMask) { this.CHANNEL_MASK = channelMask; }
+
+ public void setOutputFile(File file) { this.outputFile = file; }
+
+ public void setBufferSize(int bufferSizeInFrame) { this.BUFFER_SIZE_IN_FRAME = bufferSizeInFrame; }
+
+ public void setVadSensitivity(int vadSensitivity) { this.vadSensitivity = vadSensitivity; }
+
+ public void setVadTimeout(int vadTimeout) { this.vadTimeout = vadTimeout; }
+
+ // Step 1 - This interface defines the type of messages I want to communicate to my owner
+ public interface OnCancelCompleteListener {
+ public void onCancelCompleted();
+ }
+ private OnCancelCompleteListener cancelCompleteListener = null;
+
+ public void setCancelCompleteListener(OnCancelCompleteListener listener) {
+ this.cancelCompleteListener = listener;
+ }
+
+ public interface OnStreamListener {
+ public void onDataReceived(short[] buffer);
+ }
+ private OnStreamListener streamListener = null;
+
+ public void setStreamListener(OnStreamListener listener) {
+ this.streamListener = listener;
+ }
+
+ public interface OnVadListener {
+ public void onVadReceived(int vadResult);
+ }
+ private OnVadListener vadListener = null;
+
+ public void setVadListener(OnVadListener listener) {
+ this.vadListener = listener;
+ }
+
+ /**
+ * Opens up the given file, writes the header, and keeps filling it with raw PCM bytes from
+ * AudioRecord until it reaches 4GB or is stopped by the user. It then goes back and updates
+ * the WAV header to include the proper final chunk sizes.
+ *
+ * @return Either an Exception (error) or two longs: the file size and the elapsed time in ms (success)
+ */
+ @Override
+ protected Object[] doInBackground(File... unused) {
+ AudioRecord audioRecord = null;
+ FileOutputStream wavOut = null;
+
+ long startTime = 0;
+ long endTime = 0;
+
+ try {
+ // Open our two resources
+ int bufferSizeInBytes = BUFFER_SIZE_IN_FRAME * 2;
+ audioRecord = new AudioRecord(AUDIO_SOURCE, SAMPLE_RATE, CHANNEL_MASK, ENCODING, bufferSizeInBytes);
+ wavOut = new FileOutputStream(this.outputFile);
+
+ // Write out the wav file header
+ writeWavHeader(wavOut, CHANNEL_MASK, SAMPLE_RATE, ENCODING);
+
+ // Avoiding loop allocations
+ short[] buffer = new short[BUFFER_SIZE_IN_FRAME];
+ boolean run = true;
+ int read;
+ long total = 0;
+ int vadResult;
+
+ VadInit(SAMPLE_RATE, vadSensitivity, vadTimeout);
+
+ FloatFFT_1D fft = new FloatFFT_1D(GetVadSamplesPerFrame());
+ float[] fft_mags = new float[GetVadSamplesPerFrame()/2];
+ float[] fft_modules = new float[GetVadSamplesPerFrame()];
+ short[] samples;
+
+ // Let's go
+ startTime = SystemClock.elapsedRealtime();
+ audioRecord.startRecording();
+ while (run && !isCancelled()) {
+ read = audioRecord.read(buffer, 0, buffer.length); // Count for 16 bit PCM
+
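+ // Slice the captured buffer into VAD-sized frames, FFT each frame, and pass the magnitudes to the native detector.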
+ int samplesAnalyzed = 0;
+ while(samplesAnalyzed + GetVadSamplesPerFrame() < read){
+ samples = Arrays.copyOfRange(buffer, samplesAnalyzed, samplesAnalyzed +GetVadSamplesPerFrame());
+ for (int i = 0; i < samples.length; i++) {
+ fft_modules[i] = (float) samples[i];
+ }
+ fft.realForward(fft_modules);
+ for (int i = 0; i < fft_mags.length; i++) {
+ fft_mags[i] = (float) Math.sqrt(fft_modules[2*i]*fft_modules[2*i] + fft_modules[2*i+1]*fft_modules[2*i+1]);
+ }
+ vadResult = VadStillTalking(samples, fft_mags);
+ if (this.vadListener != null) {
+ this.vadListener.onVadReceived(vadResult);
+ }
+ samplesAnalyzed += GetVadSamplesPerFrame();
+ }
+
+ // WAVs cannot be > 4 GB due to the use of 32 bit unsigned integers.
+ if (total + read > 4294967295L) {
+ // Write as many bytes as we can before hitting the max size
+ short[] tmpBuffer = new short[BUFFER_SIZE_IN_FRAME];
+ for (int i = 0; i < read && total <= 4294967295L; i++, total+=2) {
+ ByteBuffer byteBuffer = ByteBuffer.allocate(2);
+ byteBuffer.putShort(buffer[i]);
+ wavOut.write(byteBuffer.array());
+ tmpBuffer[i] = buffer[i];
+ }
+ if (this.streamListener != null) {
+ this.streamListener.onDataReceived(tmpBuffer);
+ }
+ run = false;
+ } else if (read >= 0) {
+ // Short array to byte array
+ ByteBuffer byteBuffer = ByteBuffer.allocate(buffer.length * 2);
+ byteBuffer.order(ByteOrder.LITTLE_ENDIAN);
+ byteBuffer.asShortBuffer().put(buffer);
+ byte[] bytes = byteBuffer.array();
+
+ wavOut.write(bytes, 0, read * 2);
+
+ total += (read * 2); // 2 Byte = Short
+ if (this.streamListener != null) {
+ Log.d("onDataReceived", "RecordWaveTask - " + read + "");
+ this.streamListener.onDataReceived(buffer.clone());
+ }
+ }
+ }
+ } catch (IOException ex) {
+ return new Object[]{ex};
+ } finally {
+ Log.d("RecordWaveTask", "Finally");
+ if (audioRecord != null) {
+ try {
+ if (audioRecord.getRecordingState() == AudioRecord.RECORDSTATE_RECORDING) {
+ VadClean();
+ audioRecord.stop();
+ Log.d("RecordWaveTask", "audioRecord.stop()");
+ endTime = SystemClock.elapsedRealtime();
+ }
+ } catch (IllegalStateException ex) {
+ //
+ }
+ if (audioRecord.getState() == AudioRecord.STATE_INITIALIZED) {
+ audioRecord.release();
+ }
+ }
+ if (wavOut != null) {
+ try {
+ wavOut.close();
+ Log.d("RecordWaveTask", "wavOut.close()");
+ } catch (IOException ex) {
+ Log.d("RecordWaveTask", ex.getMessage());
+ }
+ }
+ }
+
+ try {
+ // This is not put in the try/catch/finally above since it needs to run
+ // after we close the FileOutputStream
+ this.updateWavHeader(this.outputFile);
+ } catch (IOException ex) {
+ Log.d("RecordWaveTask", ex.getMessage());
+ return new Object[] { ex };
+ }
+
+ Log.d("RecordWaveTask", (endTime - startTime) + " sec" );
+ Log.d("RecordWaveTask", this.outputFile.length() + " byte" );
+
+ return new Object[] { this.outputFile.length(), endTime - startTime };
+ }
+
+ /**
+ * Writes the proper 44-byte RIFF/WAVE header to/for the given stream
+ * Two size fields are left empty/null since we do not yet know the final stream size
+ *
+ * @param out The stream to write the header to
+ * @param channelMask An AudioFormat.CHANNEL_* mask
+ * @param sampleRate The sample rate in hertz
+ * @param encoding An AudioFormat.ENCODING_PCM_* value
+ * @throws IOException
+ */
+ private static void writeWavHeader(OutputStream out, int channelMask, int sampleRate, int encoding) throws IOException {
+ short channels;
+ switch (channelMask) {
+ case AudioFormat.CHANNEL_IN_MONO:
+ channels = 1;
+ break;
+ case AudioFormat.CHANNEL_IN_STEREO:
+ channels = 2;
+ break;
+ default:
+ throw new IllegalArgumentException("Unacceptable channel mask");
+ }
+
+ short bitDepth;
+ switch (encoding) {
+ case AudioFormat.ENCODING_PCM_8BIT:
+ bitDepth = 8;
+ break;
+ case AudioFormat.ENCODING_PCM_16BIT:
+ bitDepth = 16;
+ break;
+ case AudioFormat.ENCODING_PCM_FLOAT:
+ bitDepth = 32;
+ break;
+ default:
+ throw new IllegalArgumentException("Unacceptable encoding");
+ }
+
+ writeWavHeader(out, channels, sampleRate, bitDepth);
+ }
+
+ /**
+ * Writes the proper 44-byte RIFF/WAVE header to/for the given stream
+ * Two size fields are left empty/null since we do not yet know the final stream size
+ *
+ * @param out The stream to write the header to
+ * @param channels The number of channels
+ * @param sampleRate The sample rate in hertz
+ * @param bitDepth The bit depth
+ * @throws IOException
+ */
+ private static void writeWavHeader(OutputStream out, short channels, int sampleRate, short bitDepth) throws IOException {
+ // Convert the multi-byte integers to raw bytes in little endian format as required by the spec
+ byte[] littleBytes = ByteBuffer
+ .allocate(14)
+ .order(ByteOrder.LITTLE_ENDIAN)
+ .putShort(channels)
+ .putInt(sampleRate)
+ .putInt(sampleRate * channels * (bitDepth / 8))
+ .putShort((short) (channels * (bitDepth / 8)))
+ .putShort(bitDepth)
+ .array();
+
+ // Not necessarily the best, but it's very easy to visualize this way
+ out.write(new byte[]{
+ // RIFF header
+ 'R', 'I', 'F', 'F', // ChunkID
+ 0, 0, 0, 0, // ChunkSize (must be updated later)
+ 'W', 'A', 'V', 'E', // Format
+ // fmt subchunk
+ 'f', 'm', 't', ' ', // Subchunk1ID
+ 16, 0, 0, 0, // Subchunk1Size
+ 1, 0, // AudioFormat
+ littleBytes[0], littleBytes[1], // NumChannels
+ littleBytes[2], littleBytes[3], littleBytes[4], littleBytes[5], // SampleRate
+ littleBytes[6], littleBytes[7], littleBytes[8], littleBytes[9], // ByteRate
+ littleBytes[10], littleBytes[11], // BlockAlign
+ littleBytes[12], littleBytes[13], // BitsPerSample
+ // data subchunk
+ 'd', 'a', 't', 'a', // Subchunk2ID
+ 0, 0, 0, 0, // Subchunk2Size (must be updated later)
+ });
+ }
+
+ /**
+ * Updates the given wav file's header to include the final chunk sizes
+ *
+ * @param wav The wav file to update
+ * @throws IOException
+ */
+ private static void updateWavHeader(File wav) throws IOException {
+ byte[] sizes = ByteBuffer
+ .allocate(8)
+ .order(ByteOrder.LITTLE_ENDIAN)
+ // There are probably a bunch of different/better ways to calculate
+ // these two given your circumstances. Cast should be safe since if the WAV is
+ // > 4 GB we've already made a terrible mistake.
+ .putInt((int) (wav.length() - 8)) // ChunkSize
+ .putInt((int) (wav.length() - 44)) // Subchunk2Size
+ .array();
+
+ RandomAccessFile accessWave = null;
+ //noinspection CaughtExceptionImmediatelyRethrown
+ try {
+ accessWave = new RandomAccessFile(wav, "rw");
+ // ChunkSize
+ accessWave.seek(4);
+ accessWave.write(sizes, 0, 4);
+
+ // Subchunk2Size
+ accessWave.seek(40);
+ accessWave.write(sizes, 4, 4);
+ } catch (IOException ex) {
+ // Rethrow but we still close accessWave in our finally
+ throw ex;
+ } finally {
+ if (accessWave != null) {
+ try {
+ accessWave.close();
+ } catch (IOException ex) {
+ //
+ }
+ }
+ }
+ }
+
+ @Override
+ protected void onCancelled(Object[] results) {
+ // Handling cancellations and successful runs in the same way
+ Log.d("RecordWaveTask", "onCancelled");
+ onPostExecute(results);
+ }
+
+ @Override
+ protected void onPostExecute(Object[] results) {
+ Log.d("RecordWaveTask", "onPostExecute");
+ Throwable throwable = null;
+ if (results[0] instanceof Throwable) {
+ // Error
+ throwable = (Throwable) results[0];
+ Log.e(RecordWaveTask.class.getSimpleName(), throwable.getMessage(), throwable);
+ }
+
+ if (cancelCompleteListener != null) {
+ cancelCompleteListener.onCancelCompleted();
+ }
+ }
+}
\ No newline at end of file
diff --git a/android/src/main/jni/WITCvad.c b/android/src/main/jni/WITCvad.c
new file mode 100644
index 00000000..91eabf30
--- /dev/null
+++ b/android/src/main/jni/WITCvad.c
@@ -0,0 +1,357 @@
+//
+// WITCvad.m
+// Wit
+//
+// Created by Anthony Kesich on 11/12/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#include "WITCvad.h"
+
+
+/*
+ Adds value to the head of memory
+ */
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value);
+
+/*
+ Sums up the last N values of memory
+ */
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb);
+
+
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags)
+{
+ double dfc;
+ double band_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double sfm;
+ int fft_size = pow(2,floor(log2(cvad_state->samples_per_frame)));
+ short int counter;
+ int action = -1;
+ int zero_crossings;
+
+ //only process cvad_state->samples_per_frame samples at a time
+ //frames_detector_cvad_fft(samples, fft_modules, cvad_state->samples_per_frame);
+ dfc = frames_detector_cvad_most_dominant_freq(cvad_state, fft_mags, fft_size, cvad_state->samples_per_frame);
+ sfm = frames_detector_cvad_spectral_flatness(fft_mags, fft_size);
+ zero_crossings = frames_detector_cvad_zero_crossings(samples, cvad_state->samples_per_frame);
+ frames_detector_cvad_multiband_energy(cvad_state, fft_mags, fft_size, band_energy, cvad_state->samples_per_frame);
+
+ vw_detector_cvad_set_threshold(cvad_state);
+ counter = vw_detector_cvad_check_frame(cvad_state, band_energy, dfc, sfm, zero_crossings);
+ frame_memory_push(cvad_state, counter);
+
+ if ((counter < 3 && cvad_state->talking == 0) || !cvad_state->thresh_initialized) {
+ cvad_state->silence_count++;
+ //only update reference levels if we don't detect speech
+ wv_detector_cvad_update_ref_levels(cvad_state, band_energy, dfc, sfm);
+ }
+ if (cvad_state->thresh_initialized) {
+ int start_sum = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_START);
+ int stop_sum_long = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG);
+ int stop_sum_short = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT);
+ int speech_time = (cvad_state->frame_number-cvad_state->speech_start_frame) * cvad_state->samples_per_frame * 1000 / cvad_state->sample_freq;
+
+ if(start_sum > cvad_state->max_start_sum){
+ cvad_state->max_start_sum = start_sum;
+ }
+ if (!cvad_state->talking && start_sum >= cvad_state->start_sum_threshold ) {
+ cvad_state->talking = 1;
+ cvad_state->speech_start_frame = cvad_state->frame_number;
+ action = 1;
+ }
+ else if (cvad_state->talking && speech_time > DETECTOR_CVAD_MINIMUM_LENGTH
+ && ((counter < 3
+ && stop_sum_long <= cvad_state->max_start_sum*cvad_state->end_sum_long_coeff
+ && stop_sum_short <= cvad_state->max_start_sum*cvad_state->end_sum_short_coeff)
+ || (cvad_state->max_speech_time > 0
+ && speech_time >= cvad_state->max_speech_time))) {
+ cvad_state->talking = 0;
+ action = 0;
+ cvad_state->max_start_sum = 0;
+ }
+ }
+
+ cvad_state->frame_number++;
+
+ return action;
+}
+
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout)
+{
+ s_wv_detector_cvad_state *cvad_state = malloc(sizeof(s_wv_detector_cvad_state));
+ cvad_state->energy_thresh_coeff_lower = DETECTOR_CVAD_E_TH_COEFF_LOW_BAND;
+ cvad_state->energy_thresh_coeff_upper = DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS;
+ cvad_state->sfm_thresh= DETECTOR_CVAD_SFM_TH;
+ cvad_state->dfc_thresh= DETECTOR_CVAD_DFC_TH;
+ cvad_state->min_zero_crossings= DETECTOR_CVAD_MIN_ZERO_CROSSINGS;
+ cvad_state->max_zero_crossings= DETECTOR_CVAD_MAX_ZERO_CROSSINGS;
+ memset(cvad_state->energy_update_coeff, 0.20, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ memset(cvad_state->energy_prev_variance, -1, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ memset(cvad_state->energy_history, 0, DETECTOR_CVAD_ENERGY_MEMORY * DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ cvad_state->energy_history_index = 0;
+ cvad_state->dfc_update_coeff = 0.10;
+ cvad_state->sfm_update_coeff = 0.10;
+ cvad_state->frame_number = 0;
+ cvad_state->speech_start_frame = -1;
+ cvad_state->max_speech_time = speech_timeout;
+ cvad_state->thresh_initialized = 0;
+ cvad_state->silence_count = 0;
+ cvad_state->talking = 0;
+ memset(cvad_state->ref_energy, 0, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ cvad_state->ref_dfc = 0;
+ cvad_state->ref_sfm = 0;
+ memset(cvad_state->dfc_history, 0, DETECTOR_CVAD_FRAMES_INIT * sizeof(double));
+ cvad_state->sample_freq = sample_rate;
+ cvad_state->max_start_sum = 0;
+ cvad_state->samples_per_frame = pow(2,ceil(log2(cvad_state->sample_freq/150))); //around 100 frames per second, but must be a power of two
+ cvad_state->previous_state_index = 0;
+ memset(cvad_state->previous_state, 0, DETECTOR_CVAD_RESULT_MEMORY * sizeof(short int));
+
+ wv_detector_cvad_set_sensitivity(cvad_state, sensitivity);
+
+ return cvad_state;
+}
+
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state)
+{
+ free(cvad_state);
+}
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity)
+{
+ float sensitivity_frac = fmax(0,fmin(100,sensitivity))/100.0;
+ cvad_state->n_frames_check_start=DETECTOR_CVAD_N_FRAMES_CHECK_START;
+ cvad_state->n_frames_check_end_short=DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT;
+ cvad_state->n_frames_check_end_long=DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG;
+
+ cvad_state->start_sum_threshold = DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE*sensitivity_frac;
+ cvad_state->start_sum_threshold += DETECTOR_CVAD_COUNT_SUM_START*(1-sensitivity_frac);
+
+ cvad_state->end_sum_short_coeff = DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE*sensitivity_frac;
+ cvad_state->end_sum_short_coeff += DETECTOR_CVAD_COUNT_END_SHORT_FACTOR*(1-sensitivity_frac);
+
+ cvad_state->end_sum_long_coeff = DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE*sensitivity_frac;
+ cvad_state->end_sum_long_coeff += DETECTOR_CVAD_COUNT_END_LONG_FACTOR*(1-sensitivity_frac);
+}
+
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state,
+ double *band_energy,
+ double dfc,
+ double sfm)
+{
+ int b=0;
+ if (!cvad_state->thresh_initialized) {
+ //if still initializing, accumulate values to average
+ for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ cvad_state->ref_energy[b] += band_energy[b];
+ }
+
+
+ cvad_state->ref_sfm += sfm;
+
+ cvad_state->dfc_history[cvad_state->frame_number] = dfc > 0 ? log(dfc) : 0;
+ }
+
+ //record energy history
+ for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ cvad_state->energy_history[b][cvad_state->energy_history_index] = band_energy[b];
+ }
+ cvad_state->energy_history_index++;
+ cvad_state->energy_history_index%=DETECTOR_CVAD_ENERGY_MEMORY;
+
+ if (cvad_state->frame_number >= DETECTOR_CVAD_FRAMES_INIT) {
+ if(!cvad_state->thresh_initialized) {
+ //if done initializing, divide by number of samples to get an average
+ cvad_state->thresh_initialized = 1;
+ for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ cvad_state->ref_energy[b] /= cvad_state->frame_number;
+ }
+
+ cvad_state->ref_sfm /= cvad_state->frame_number;
+
+ double sum = 0;
+ double sq_sum = 0;
+ for(b=0; b<DETECTOR_CVAD_FRAMES_INIT; b++){
+ cvad_state->ref_dfc+=cvad_state->dfc_history[b];
+ sum += cvad_state->dfc_history[b];
+ sq_sum += pow(cvad_state->dfc_history[b],2);
+ }
+ cvad_state->ref_dfc /= cvad_state->frame_number;
+ cvad_state->ref_dfc_var = (sq_sum-sum*sum/cvad_state->frame_number)/(cvad_state->frame_number -1);
+
+ } else if (cvad_state->talking == 0) {
+ //otherwise update thresholds based on adaptive rules if there's no speech
+ wv_detector_cvad_modify_update_coeffs(cvad_state);
+ for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ cvad_state->ref_energy[b] *= (1-cvad_state->energy_update_coeff[b]);
+ cvad_state->ref_energy[b] += cvad_state->energy_update_coeff[b]*band_energy[b];
+ }
+
+ }
+ }
+
+}
+
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state)
+{
+ //update thresholds to be a multiple of the reference level
+ int b;
+ cvad_state->th_energy[0] = cvad_state->ref_energy[0]*cvad_state->energy_thresh_coeff_lower;
+ for(b=1; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ cvad_state->th_energy[b] = cvad_state->ref_energy[b]*cvad_state->energy_thresh_coeff_upper;
+ }
+ cvad_state->th_dfc = cvad_state->ref_dfc+cvad_state->dfc_thresh;
+ cvad_state->th_sfm = cvad_state->ref_sfm+cvad_state->sfm_thresh;
+}
+
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state){
+ int b;
+ for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ double sum = 0;
+ double sq_sum = 0;
+ int h;
+ for(h=0; h<DETECTOR_CVAD_ENERGY_MEMORY; h++){
+ sum+=cvad_state->energy_history[b][h];
+ sq_sum+=pow(cvad_state->energy_history[b][h],2);
+ }
+ double variance = (sq_sum-sum*sum/DETECTOR_CVAD_ENERGY_MEMORY)/(DETECTOR_CVAD_ENERGY_MEMORY-1);
+ double ratio = variance/cvad_state->energy_prev_variance[b];
+ if(ratio > 1.25){
+ cvad_state->energy_update_coeff[b] = 0.25;
+ } else if(ratio > 1.10){
+ cvad_state->energy_update_coeff[b] = 0.20;
+ } else if(ratio > 1.00){
+ cvad_state->energy_update_coeff[b] = 0.15;
+ } else if(ratio > 0.00){
+ cvad_state->energy_update_coeff[b] = 0.10;
+ } else {
+ //negative value indicates that this is the first pass of variance. Just set the coeff to 0.2
+ cvad_state->energy_update_coeff[b] = 0.20;
+ }
+ cvad_state->energy_prev_variance[b] = variance;
+ }
+}
+
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings)
+{
+ short int counter;
+
+ counter = 0;
+
+ int band_counter = 0;
+ if (band_energy[0] > cvad_state->th_energy[0]) {
+ counter += 2;
+ }
+
+ int b;
+ for(b=1; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ if(band_energy[b] > cvad_state->th_energy[b]){
+ band_counter++;
+ }
+ }
+ if(band_counter >= 2){
+ counter+=2;
+ }
+
+ if (fabs((dfc > 0 ? log(dfc): 0) - cvad_state->ref_dfc) > cvad_state->ref_dfc_var) {
+ counter++;
+ }
+ if (sfm > cvad_state->th_sfm) {
+ counter++;
+ }
+ if(zero_crossings >= cvad_state->min_zero_crossings && zero_crossings <= cvad_state->max_zero_crossings){
+ counter++;
+ }
+
+ return counter;
+}
+
+
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples)
+{
+ double k = 0.0f;
+ double max = 0.0f;
+ double amplitude_minimum = 1.0f;
+ int i;
+
+ for (i = 0; i < nb_modules; i++) {
+ if (fft_mags[i] > max && fft_mags[i] > amplitude_minimum) {
+ max = fft_mags[i];
+ k = i;
+ }
+ }
+
+ return k * (double)cvad_state->sample_freq / (double)nb_samples;
+}
+
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples){
+
+ int b = 0;
+ int k = 0;
+
+ for(b = 0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+ band_energy[b] = 0;
+ while(k < nb_modules && k*cvad_state->sample_freq/nb_samples < 1000*(b+1)){
+ band_energy[b]+=fft_mags[k];
+ k++;
+ }
+ }
+
+}
+
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb)
+{
+ double geo_mean = 0.0f;
+ double arithm_mean = 0.0f;
+ double sfm = 0.0f;
+ int i;
+
+ for (i = 0; i < nb; i++) {
+ if (fft_mags[i] != 0.0f) {
+ geo_mean += log(fft_mags[i]);
+ arithm_mean += fft_mags[i];
+ }
+ }
+ geo_mean = exp(geo_mean / (double) nb);
+ arithm_mean = arithm_mean / (double) nb;
+ sfm = 10 * log10(geo_mean / arithm_mean);
+ sfm = fabs(sfm);
+
+ return sfm;
+}
+
+int frames_detector_cvad_zero_crossings(short int *samples, int nb){
+ int num_zero_crossings = 0;
+ int i;
+
+ for(i=1; i<nb; i++){
+ if((samples[i] >= 0) != (samples[i-1] >= 0)){
+ num_zero_crossings++;
+ }
+ }
+
+ return num_zero_crossings;
+}
+
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value)
+{
+ cvad_state->previous_state[cvad_state->previous_state_index] = value;
+ cvad_state->previous_state_index++;
+ cvad_state->previous_state_index%=DETECTOR_CVAD_RESULT_MEMORY;
+}
+
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb)
+{
+ int i = 0;
+ int sum = 0;
+
+ for (i = 0; i < nb; i++) {
+ int indx = (cvad_state->previous_state_index - (i+1) + DETECTOR_CVAD_RESULT_MEMORY) % DETECTOR_CVAD_RESULT_MEMORY;
+ sum += cvad_state->previous_state[indx];
+ }
+
+ return sum;
+}
+
diff --git a/android/src/main/jni/WITCvad.h b/android/src/main/jni/WITCvad.h
new file mode 100644
index 00000000..48795946
--- /dev/null
+++ b/android/src/main/jni/WITCvad.h
@@ -0,0 +1,169 @@
+//
+// WITCvad.h
+// Wit
+//
+// Created by Anthony Kesich on 11/12/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#ifndef Wit_WITCvad_h
+#define Wit_WITCvad_h
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+
+/*
+ * This speech algorithm looks at multiple auditory components related to speech:
+ * - Energy divided into 1 KHz bands
+ * - Dominant Frequency Component
+ * - Spectral Flatness Measure
+ * - Zero-crossings
+ *
+ * If many features of speech are present for a period of time (~150 ms), speech is detected.
+ * The end of speech is determined by most features of speech disappearing for an extended period of time (~1 sec)
+ */
+
+#define DETECTOR_CVAD_FRAMES_INIT 40 /* number of frames to use to initialize values */
+#define DETECTOR_CVAD_E_TH_COEFF_LOW_BAND 2.5f /* Energy threshold coefficient */
+#define DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS 2.0f /* Energy threshold coefficient */
+#define DETECTOR_CVAD_SFM_TH 3.0f /* Spectral Flatness Measure threshold */
+#define DETECTOR_CVAD_DFC_TH 250.0f /* most Dominant Frequency Component threshold */
+#define DETECTOR_CVAD_MIN_ZERO_CROSSINGS 5 /* fewest zero crossings for speech */
+#define DETECTOR_CVAD_MAX_ZERO_CROSSINGS 15 /* maximum zero crossings for speech */
+#define DETECTOR_CVAD_RESULT_MEMORY 130 /* number of frame results to keep in memory */
+#define DETECTOR_CVAD_ENERGY_MEMORY 20 /* number of frame results to keep in memory */
+#define DETECTOR_CVAD_N_ENERGY_BANDS 5 /* number of 1 KHz energy bands to compute */
+#define DETECTOR_CVAD_MINIMUM_LENGTH 1000 /* minimum length of vad in ms */
+
+//final speech detection variables
+#define DETECTOR_CVAD_N_FRAMES_CHECK_START 15
+#define DETECTOR_CVAD_COUNT_SUM_START 4.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE 3.8*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT 1.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR 0.6
+#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE 0.3
+#define DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG 6.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR 1.8
+#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE 1.5
+
+typedef struct {
+ double energy_thresh_coeff_lower;
+ double energy_thresh_coeff_upper;
+ double sfm_thresh;
+ double dfc_thresh;
+ double th_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double th_sfm;
+ double th_dfc;
+ double ref_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double ref_sfm;
+ double ref_dfc;
+ double ref_dfc_var;
+ double energy_update_coeff[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double energy_prev_variance[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double energy_history[DETECTOR_CVAD_N_ENERGY_BANDS][DETECTOR_CVAD_ENERGY_MEMORY];
+ double sfm_update_coeff;
+ double dfc_history[DETECTOR_CVAD_FRAMES_INIT];
+ double dfc_update_coeff;
+ float end_sum_long_coeff;
+ float end_sum_short_coeff;
+ int frame_number;
+ int speech_start_frame;
+ int max_speech_time;
+ int energy_history_index;
+ int min_zero_crossings;
+ int max_zero_crossings;
+ int thresh_initialized;
+ int silence_count;
+ int talking;
+ int sample_freq;
+ int samples_per_frame;
+ int max_start_sum;
+ int n_frames_check_start;
+ int n_frames_check_end_short;
+ int n_frames_check_end_long;
+ int start_sum_threshold;
+ int previous_state_index;
+ short int previous_state[DETECTOR_CVAD_RESULT_MEMORY];
+} s_wv_detector_cvad_state;
+
+/*
+ Main entry point to the detection algorithm.
+ This returns -1 if there is no change in state, 1 if someone started talking, and 0 if speech ended
+ */
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags);
+
+
+/*
+ Initiate the cvad_state structure, which represents the state of
+ one instance of the algorithm
+
+ sensitivity: 0 for a close-up mic, up to 100 for a fixed, distant mic
+ */
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout);
+
+/*
+ Safely frees memory for a cvad_state
+ */
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Set VAD sensitivity (0-100)
+ - Lower values are for strong voice signals like for a cellphone or personal mic
+ - Higher values are for use with a fixed-position mic or any application with voice buried in ambient noise
+ - Defaults to 0
+ */
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity);
+
+/*
+ Set the reference values of the energy, most dominant frequency component and the spectral flatness measure.
+ The threshold value is then set based on the "background" reference levels
+ */
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm);
+
+/*
+ Set the threshold on the cvad_state.
+ */
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Computes the variance of the energy over the past few windows and adapts the update coeffs accordingly
+ */
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Compare the distance between the value and the minimum value of each component and return how many
+ component(s) responded positively.
+ Each frame with more than 2 (out of 3) matching features is qualified as a speech frame.
+ example : energy - cvad_state->min_energy > cvad_state->th_energy
+ */
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings);
+
+/*
+ Return the frequency with the biggest amplitude (from a frame).
+ */
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples);
+
+/*
+ Computes the energy of the first DETECTOR_CVAD_N_ENERGY_BANDS 1 KHz bands
+ */
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples);
+
+/*
+ Compute the spectral flatness of a frame.
+ It tells us if all the frequencies have a similar amplitude, which would mean noise,
+ or if there are some dominant frequencies, which could mean voice.
+ */
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb);
+
+/*
+ Counts the number of times the signal crosses zero
+ Even soft vocalizations have a fairly regular number of zero crossings (~5-15 for 10ms)
+ */
+int frames_detector_cvad_zero_crossings(short int *samples, int nb);
+
+#endif
diff --git a/android/src/main/jni/WITVadSimple.c b/android/src/main/jni/WITVadSimple.c
new file mode 100644
index 00000000..a94d8896
--- /dev/null
+++ b/android/src/main/jni/WITVadSimple.c
@@ -0,0 +1,206 @@
+//
+// WITVadSimple.c
+// Wit
+//
+// Created by Aric Lasry on 8/6/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+
+#include "WITVadSimple.h"
+
+/**
+ * wvs_pcm16short2dbfs - converts short (16 bits) samples to decibel full scale
+ * @samples: array of pcm 16 bits samples
+ * @size: numbers of sample
+ *
+ * Return a new allocated buffer of double, which will need to be free later
+ */
+static double * wvs_pcm16short2dbfs(short int *samples, int size);
+
+static double frames_detector_esf_energy(double *samples, int nb_samples);
+static void detector_esf_minimum(wvs_state *state, double energy, int n);
+static int detector_esf_check_frame(wvs_state *state, double energy);
+static void memory_push(int *memory, int length, int value);
+static int frame_memory_lte(int *memory, int value, int nb);
+static int frame_memory_gte(int *memory, int value, int nb);
+static int wvs_check(wvs_state *state, double *samples, int nb_samples);
+
+
+int wvs_still_talking(wvs_state *state, short int *samples, int nb_samples)
+{
+ double *dbfss;
+ double db;
+ int result;
+ int i_sample;
+
+ dbfss = wvs_pcm16short2dbfs(samples, nb_samples);
+ for (i_sample = 0; i_sample < nb_samples; i_sample++) {
+ db = dbfss[i_sample];
+ if (isinf(db)) {
+ continue;
+ }
+ if (state->current_nb_samples == state->samples_per_frame) {
+ result = wvs_check(state, state->samples, state->current_nb_samples);
+ if (result == 0 || result == 1) {
+ free(dbfss);
+ return result;
+ }
+ state->current_nb_samples = 0;
+ }
+ state->samples[state->current_nb_samples] = db;
+ state->current_nb_samples++;
+ }
+ free(dbfss);
+
+ return -1;
+}
+
+static int wvs_check(wvs_state *state, double *samples, int nb_samples)
+{
+ int counter;
+ double energy;
+ int action;
+ char debug_msg[128];
+
+ action = -1;
+ energy = frames_detector_esf_energy(samples, nb_samples);
+
+ if (state->sequence <= state->init_frames) {
+ detector_esf_minimum(state, energy, state->sequence);
+ }
+ counter = detector_esf_check_frame(state, energy);
+ if (state->sequence >= state->init_frames && !counter && !state->talking) {
+ detector_esf_minimum(state, energy, state->sequence);
+ }
+ memory_push(state->previous_state, state->previous_state_maxlen, counter);
+ if (state->sequence < state->init_frames) {
+ state->sequence++;
+ return -1;
+ }
+ if (state->talking == 0 && frame_memory_gte(state->previous_state, 1, 10)) {
+ state->talking = 1;
+ __android_log_write(ANDROID_LOG_DEBUG, "WitVAD", "Speak start");
+ action = 1;
+ }
+ else if (state->talking == 1 && frame_memory_lte(state->previous_state, 0, state->previous_state_maxlen)) {
+ state->talking = 0;
+ action = 0;
+ __android_log_write(ANDROID_LOG_DEBUG, "WitVAD", "Speak end");
+ }
+ state->sequence++;
+
+ return action;
+}
+
+wvs_state *wvs_init(double threshold, int sample_rate)
+{
+ wvs_state *state;
+
+ state = malloc(sizeof(*state));
+ state->sequence = 0;
+ state->min_initialized = 0;
+ state->init_frames = 30;
+ state->energy_threshold = threshold;
+ state->previous_state_maxlen = 50;
+ state->previous_state = malloc(sizeof(*state->previous_state) * state->previous_state_maxlen);
+ state->talking = 0;
+ state->sample_rate = sample_rate;
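+ /* one frame is 10 ms of audio: sample_rate / 100 samples */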
+ state->samples_per_frame = state->sample_rate / 100;
+ state->samples = malloc(sizeof(*state->samples) * state->samples_per_frame);
+ state->current_nb_samples = 0;
+ state->min_energy = 0.0;
+
+ return state;
+}
+
+void wvs_clean(wvs_state *state)
+{
+ free(state->samples);
+ free(state->previous_state);
+ free(state);
+}
+
+static double * wvs_pcm16short2dbfs(short int *samples, int size)
+{
+ double *dbfss;
+ double max_ref;
+ int i;
+
+ max_ref = 32768; //pow(2.0, 16.0) / 2; signed 16 bits w/o the -1
+ dbfss = malloc(sizeof(*dbfss) * size);
+
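+ /* Note the flipped sign: quieter samples map to larger values, and silence maps to +inf, which the caller skips via isinf(). */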
+ for (i = 0; i < size; i++) {
+ dbfss[i] = 0 - 20 * log10(fabs(samples[i] / max_ref));
+ }
+
+ return dbfss;
+}
+
+static double frames_detector_esf_energy(double *samples, int nb_samples)
+{
+ double energy = 0.0f;
+ int i;
+
+ for (i = 0; i < nb_samples; i++) {
+ energy += samples[i];
+ }
+ energy /= nb_samples;
+
+ return energy;
+}
+
+static void detector_esf_minimum(wvs_state *state, double energy, int n)
+{
+ n = (n > 10) ? 10 : n; //this correspond to 1/10 of a second
+ state->min_energy = (state->min_energy * n + energy) / (n + 1);
+ state->min_initialized = 1;
+}
+
+static int detector_esf_check_frame(wvs_state *state, double energy)
+{
+ int counter;
+
+ counter = 0;
+ char debug_msg[200];
+
+ if ((0 - (energy - state->min_energy)) >= state->energy_threshold) {
+ counter++;
+ }
+
+ return counter;
+}
+
+static void memory_push(int *memory, int length, int value)
+{
+ while (--length) {
+ memory[length] = memory[length - 1];
+ }
+ memory[0] = value;
+}
+
+static int frame_memory_gte(int *memory, int value, int nb)
+{
+ int i = 0;
+
+ for (i = 0; i < nb; i++) {
+ if (memory[i] < value) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int frame_memory_lte(int *memory, int value, int nb)
+{
+ int i;
+
+ for (i = 0; i < nb; i++) {
+ if (memory[i] > value) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
diff --git a/android/src/main/jni/WITVadSimple.h b/android/src/main/jni/WITVadSimple.h
new file mode 100644
index 00000000..7c1a9802
--- /dev/null
+++ b/android/src/main/jni/WITVadSimple.h
@@ -0,0 +1,73 @@
+//
+// WITVadSimple.h
+// Wit
+//
+// Created by Aric Lasry on 8/6/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#ifndef Wit_WITVadSimple_h
+#define Wit_WITVadSimple_h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <android/log.h>
+
+
+
+/**
+ * This voice activity detection is very simple. It computes the average of the
+ * audio powers from the beginning and the last second, and compares the distance
+ * between the two with a pre-defined threshold.
+ *
+ * The "audio powers" are average of audio chunks in DBFS. It could also be PCM samples...
+ */
+
+/*
+ state of the voice activity detection algorithm.
+ */
+typedef struct {
+ /* frame number */
+ int sequence;
+
+ /* is the environment initialized? */
+ int min_initialized;
+
+ /* frame number needed for initialization */
+ int init_frames;
+
+ double energy_threshold;
+
+ double min_energy;
+
+ int *previous_state;
+
+ int previous_state_maxlen;
+
+ int talking;
+
+ /* number of sample per second */
+ int sample_rate;
+
+ /* number of samples needed to calculate the feature(s) */
+ int samples_per_frame;
+
+ /* samples list to send to the checking function when enough are available */
+ double *samples;
+
+ int current_nb_samples;
+} wvs_state;
+
+int wvs_still_talking(wvs_state *state, short int *samples, int nb_samples);
+
+wvs_state *wvs_init(double threshold, int sample_rate);
+
+/**
+ * wvs_clean - clean a wvs_state* structure
+ * @state: the structure to free.
+ */
+void wvs_clean(wvs_state *state);
+
+#endif
diff --git a/android/src/main/jni/WITVadWrapper.c b/android/src/main/jni/WITVadWrapper.c
new file mode 100644
index 00000000..197b1bb2
--- /dev/null
+++ b/android/src/main/jni/WITVadWrapper.c
@@ -0,0 +1,55 @@
+#include "WITCvad.h"
+#include <jni.h>
+
+
+static s_wv_detector_cvad_state* wit_vad_g_struct = 0;
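+// A single shared detector state: only one VAD stream can be active at a time.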
+
+int Java_com_rnim_rn_audio_RecordWaveTask_VadInit(JNIEnv *env, jobject obj, jint sample_rate, jint vadSensitivity, jint vadTimeout)
+{
+ vadSensitivity = (int)fmax(0,fmin(100,vadSensitivity)); //bounds-checking
+ wit_vad_g_struct = wv_detector_cvad_init(sample_rate, (int)vadSensitivity, (int)vadTimeout);
+
+ return 0;
+}
+
+
+int Java_com_rnim_rn_audio_RecordWaveTask_VadStillTalking(JNIEnv *env, jobject obj, jshortArray java_arr, jfloatArray java_fft_arr)
+{
+ short int *samples;
+ float *fft_mags;
+ int i, sum = 0;
+ int result;
+ jshort *native_arr = (*env)->GetShortArrayElements(env, java_arr, NULL);
+ jfloat *native_fft_arr = (*env)->GetFloatArrayElements(env, java_fft_arr, NULL);
+ int arr_len = wit_vad_g_struct->samples_per_frame;
+
+ samples = malloc(sizeof(*samples) * arr_len);
+ for (i = 0; i < arr_len; i++) {
+ samples[i] = native_arr[i];
+ }
+ (*env)->ReleaseShortArrayElements(env, java_arr, native_arr, 0);
+
+ fft_mags = malloc(sizeof(*fft_mags) * arr_len);
+ for (i = 0; i < arr_len/2; i++) {
+ fft_mags[i] = native_fft_arr[i];
+ }
+ (*env)->ReleaseFloatArrayElements(env, java_fft_arr, native_fft_arr, 0);
+
+ result = wvs_cvad_detect_talking(wit_vad_g_struct, samples, fft_mags);
+ free(samples);
+ free(fft_mags);
+
+ return result;
+}
+
+void Java_com_rnim_rn_audio_RecordWaveTask_VadClean()
+{
+ if (wit_vad_g_struct) {
+ wv_detector_cvad_clean(wit_vad_g_struct);
+ wit_vad_g_struct = 0;
+ }
+}
+
+int Java_com_rnim_rn_audio_RecordWaveTask_GetVadSamplesPerFrame(){
+ return wit_vad_g_struct->samples_per_frame;
+}
diff --git a/index.js b/index.js
index 450ed839..b3cfe295 100644
--- a/index.js
+++ b/index.js
@@ -12,7 +12,7 @@ import ReactNative, {
var AudioRecorderManager = NativeModules.AudioRecorderManager;
var AudioRecorder = {
- prepareRecordingAtPath: function(path, options) {
+ prepareStreamingAtPath: function(path, bufferSize=8192, options, vadOptions) {
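+ // bufferSize is the number of 16-bit frames delivered per 'dataReceived' event; 8192 frames is about 186 ms at 44.1 kHz.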
if (this.progressSubscription) this.progressSubscription.remove();
this.progressSubscription = NativeAppEventEmitter.addListener('recordingProgress',
(data) => {
@@ -31,45 +31,76 @@ var AudioRecorder = {
}
);
+ if (this.dataReceivedSubscription) this.dataReceivedSubscription.remove();
+ this.dataReceivedSubscription = NativeAppEventEmitter.addListener('dataReceived',
+ (data) => {
+ if (this.onDataReceived) {
+ this.onDataReceived(data);
+ }
+ }
+ );
+
+ if (this.vadReceivedSubscription) this.vadReceivedSubscription.remove();
+ this.vadReceivedSubscription = NativeAppEventEmitter.addListener('vadReceived',
+ (vadResult) => {
+ if (this.onVadReceived) {
+ this.onVadReceived(vadResult);
+ }
+ }
+ );
+
var defaultOptions = {
SampleRate: 44100.0,
- Channels: 2,
+ Channels: 1,
AudioQuality: 'High',
AudioEncoding: 'ima4',
- OutputFormat: 'mpeg_4',
MeteringEnabled: false,
- AudioEncodingBitRate: 32000
+ AudioSource: 'DEFAULT',
+ // OutputFormat: 'mpeg_4',
+ // AudioEncodingBitRate: 32000
};
var recordingOptions = {...defaultOptions, ...options};
+ var defaultVadOptions = {
+ Sensitivity: 0,
+ Timeout: 7000,
+ }
+
+ vadOptions = {...defaultVadOptions, ...vadOptions};
+
if (Platform.OS === 'ios') {
- AudioRecorderManager.prepareRecordingAtPath(
+ AudioRecorderManager.prepareStreamingAtPath(
path,
+ bufferSize,
recordingOptions.SampleRate,
recordingOptions.Channels,
recordingOptions.AudioQuality,
recordingOptions.AudioEncoding,
- recordingOptions.MeteringEnabled
+ recordingOptions.MeteringEnabled,
+ vadOptions.Sensitivity,
+ vadOptions.Timeout,
);
} else {
- return AudioRecorderManager.prepareRecordingAtPath(path, recordingOptions);
+ return AudioRecorderManager.prepareStreamingAtPath(path, bufferSize, recordingOptions, vadOptions);
}
},
- startRecording: function() {
- return AudioRecorderManager.startRecording();
+ startStreaming: function() {
+ return AudioRecorderManager.startStreaming();
},
- pauseRecording: function() {
- return AudioRecorderManager.pauseRecording();
+ stopStreaming: function() {
+ return AudioRecorderManager.stopStreaming();
},
- stopRecording: function() {
- return AudioRecorderManager.stopRecording();
+ pauseStreaming: function() {
+ return AudioRecorderManager.pauseStreaming();
},
checkAuthorizationStatus: AudioRecorderManager.checkAuthorizationStatus,
requestAuthorization: AudioRecorderManager.requestAuthorization,
removeListeners: function() {
if (this.progressSubscription) this.progressSubscription.remove();
if (this.finishedSubscription) this.finishedSubscription.remove();
+ if (this.dataReceivedSubscription) this.dataReceivedSubscription.remove();
+ if (this.vadReceivedSubscription) this.vadReceivedSubscription.remove();
},
};
diff --git a/ios/AudioRecorderManager.h b/ios/AudioRecorderManager.h
index d117e923..743e8e0c 100644
--- a/ios/AudioRecorderManager.h
+++ b/ios/AudioRecorderManager.h
@@ -9,7 +9,8 @@
#import <Foundation/Foundation.h>
#import <React/RCTBridgeModule.h>
#import <AVFoundation/AVFoundation.h>
+#import "WITVad.h"
-@interface AudioRecorderManager : NSObject <RCTBridgeModule, AVAudioRecorderDelegate>
+@interface AudioRecorderManager : NSObject <RCTBridgeModule, WITVadDelegate>
-@end
\ No newline at end of file
+@end
diff --git a/ios/AudioRecorderManager.m b/ios/AudioRecorderManager.m
index c8e9bbb8..355cb192 100644
--- a/ios/AudioRecorderManager.m
+++ b/ios/AudioRecorderManager.m
@@ -12,13 +12,17 @@
#import <React/RCTBridge.h>
#import <React/RCTEventDispatcher.h>
#import <AVFoundation/AVFoundation.h>
+#import "StreamingModule.h"
+#import "WITVad.h"
NSString *const AudioRecorderEventProgress = @"recordingProgress";
NSString *const AudioRecorderEventFinished = @"recordingFinished";
+NSString *const AudioRecorderEventDataReceived = @"dataReceived";
+NSString *const AudioRecorderEventVadReceived = @"vadReceived";
@implementation AudioRecorderManager {
- AVAudioRecorder *_audioRecorder;
+ // AVAudioRecorder *_audioRecorder;
NSTimeInterval _currentTime;
id _progressUpdateTimer;
@@ -29,17 +33,22 @@ @implementation AudioRecorderManager {
NSNumber *_audioEncoding;
NSNumber *_audioChannels;
NSNumber *_audioSampleRate;
- AVAudioSession *_recordSession;
BOOL _meteringEnabled;
+ int _bufferSize;
+ int _vadSensitivity;
+ int _vadTimeout;
}
+StreamingModule* streamingModule;
+WITVad *vad;
+
@synthesize bridge = _bridge;
RCT_EXPORT_MODULE();
- (void)sendProgressUpdate {
- if (_audioRecorder && _audioRecorder.recording) {
- _currentTime = _audioRecorder.currentTime;
+ if (streamingModule && streamingModule->recording) {
+ _currentTime = streamingModule->currentTime;
} else {
return;
}
@@ -48,11 +57,13 @@ - (void)sendProgressUpdate {
(([_prevProgressUpdateTime timeIntervalSinceNow] * -1000.0) >= _progressUpdateInterval)) {
NSMutableDictionary *body = [[NSMutableDictionary alloc] init];
[body setObject:[NSNumber numberWithFloat:_currentTime] forKey:@"currentTime"];
+ /*
if (_meteringEnabled) {
[_audioRecorder updateMeters];
float _currentMetering = [_audioRecorder averagePowerForChannel: 0];
[body setObject:[NSNumber numberWithFloat:_currentMetering] forKey:@"currentMetering"];
}
+ */
[self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventProgress body:body];
@@ -74,10 +85,10 @@ - (void)startProgressTimer {
[_progressUpdateTimer addToRunLoop:[NSRunLoop mainRunLoop] forMode:NSDefaultRunLoopMode];
}
-- (void)audioRecorderDidFinishRecording:(AVAudioRecorder *)recorder successfully:(BOOL)flag {
- [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventFinished body:@{
- @"status": flag ? @"OK" : @"ERROR",
- @"audioFileURL": [_audioFileURL absoluteString]
+- (void)finishRecording:(BOOL)flag {
+ [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventFinished body:@{
+ @"status": flag ? @"OK" : @"ERROR",
+ @"audioFileURL": [_audioFileURL absoluteString]
}];
}
@@ -88,124 +99,6 @@ - (NSString *) applicationDocumentsDirectory
return basePath;
}
-RCT_EXPORT_METHOD(prepareRecordingAtPath:(NSString *)path sampleRate:(float)sampleRate channels:(nonnull NSNumber *)channels quality:(NSString *)quality encoding:(NSString *)encoding meteringEnabled:(BOOL)meteringEnabled)
-{
- _prevProgressUpdateTime = nil;
- [self stopProgressTimer];
-
- _audioFileURL = [NSURL fileURLWithPath:path];
-
- // Default options
- _audioQuality = [NSNumber numberWithInt:AVAudioQualityHigh];
- _audioEncoding = [NSNumber numberWithInt:kAudioFormatAppleIMA4];
- _audioChannels = [NSNumber numberWithInt:2];
- _audioSampleRate = [NSNumber numberWithFloat:44100.0];
- _meteringEnabled = NO;
-
- // Set audio quality from options
- if (quality != nil) {
- if ([quality isEqual: @"Low"]) {
- _audioQuality =[NSNumber numberWithInt:AVAudioQualityLow];
- } else if ([quality isEqual: @"Medium"]) {
- _audioQuality =[NSNumber numberWithInt:AVAudioQualityMedium];
- } else if ([quality isEqual: @"High"]) {
- _audioQuality =[NSNumber numberWithInt:AVAudioQualityHigh];
- }
- }
-
- // Set channels from options
- if (channels != nil) {
- _audioChannels = channels;
- }
-
- // Set audio encoding from options
- if (encoding != nil) {
- if ([encoding isEqual: @"lpcm"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatLinearPCM];
- } else if ([encoding isEqual: @"ima4"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleIMA4];
- } else if ([encoding isEqual: @"aac"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEG4AAC];
- } else if ([encoding isEqual: @"MAC3"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE3];
- } else if ([encoding isEqual: @"MAC6"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE6];
- } else if ([encoding isEqual: @"ulaw"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatULaw];
- } else if ([encoding isEqual: @"alaw"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatALaw];
- } else if ([encoding isEqual: @"mp1"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer1];
- } else if ([encoding isEqual: @"mp2"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer2];
- } else if ([encoding isEqual: @"alac"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleLossless];
- } else if ([encoding isEqual: @"amr"]) {
- _audioEncoding =[NSNumber numberWithInt:kAudioFormatAMR];
- }
- }
-
- // Set sample rate from options
- _audioSampleRate = [NSNumber numberWithFloat:sampleRate];
-
- NSDictionary *recordSettings = [NSDictionary dictionaryWithObjectsAndKeys:
- _audioQuality, AVEncoderAudioQualityKey,
- _audioEncoding, AVFormatIDKey,
- _audioChannels, AVNumberOfChannelsKey,
- _audioSampleRate, AVSampleRateKey,
- nil];
-
- // Enable metering from options
- if (meteringEnabled != NO) {
- _meteringEnabled = meteringEnabled;
- }
-
- NSError *error = nil;
-
- _recordSession = [AVAudioSession sharedInstance];
- [_recordSession setCategory:AVAudioSessionCategoryMultiRoute error:nil];
-
- _audioRecorder = [[AVAudioRecorder alloc]
- initWithURL:_audioFileURL
- settings:recordSettings
- error:&error];
-
- _audioRecorder.meteringEnabled = _meteringEnabled;
- _audioRecorder.delegate = self;
-
- if (error) {
- NSLog(@"error: %@", [error localizedDescription]);
- // TODO: dispatch error over the bridge
- } else {
- [_audioRecorder prepareToRecord];
- }
-}
-
-RCT_EXPORT_METHOD(startRecording)
-{
- if (!_audioRecorder.recording) {
- [self startProgressTimer];
- [_recordSession setActive:YES error:nil];
- [_audioRecorder record];
-
- }
-}
-
-RCT_EXPORT_METHOD(stopRecording)
-{
- [_audioRecorder stop];
- [_recordSession setActive:NO error:nil];
- _prevProgressUpdateTime = nil;
-}
-
-RCT_EXPORT_METHOD(pauseRecording)
-{
- if (_audioRecorder.recording) {
- [self stopProgressTimer];
- [_audioRecorder pause];
- }
-}
-
RCT_EXPORT_METHOD(checkAuthorizationStatus:(RCTPromiseResolveBlock)resolve reject:(__unused RCTPromiseRejectBlock)reject)
{
AVAudioSessionRecordPermission permissionStatus = [[AVAudioSession sharedInstance] recordPermission];
@@ -237,6 +130,146 @@ - (NSString *) applicationDocumentsDirectory
}];
}
+RCT_EXPORT_METHOD(prepareStreamingAtPath:(NSString *)path bufferSize:(int)bufferSize sampleRate:(float)sampleRate channels:(nonnull NSNumber *)channels quality:(NSString *)quality encoding:(NSString *)encoding meteringEnabled:(BOOL)meteringEnabled vadSensitivity:(int)vadSensitivity vadTimeout:(int)vadTimeout)
+{
+ NSLog(@"prepareStreaming");
+ _audioFileURL = [NSURL fileURLWithPath:path];
+
+ // Default options
+ _audioQuality = [NSNumber numberWithInt:AVAudioQualityHigh];
+ _audioEncoding = [NSNumber numberWithInt:kAudioFormatAppleIMA4];
+ _audioChannels = [NSNumber numberWithInt:1];
+ _audioSampleRate = [NSNumber numberWithFloat:44100.0];
+ _meteringEnabled = NO;
+ _bufferSize = 8192;
+ _vadSensitivity = 0;
+ _vadTimeout = 7000;
+
+ // Set audio quality from options
+ if (quality != nil) {
+ if ([quality isEqual: @"Low"]) {
+ _audioQuality =[NSNumber numberWithInt:AVAudioQualityLow];
+ } else if ([quality isEqual: @"Medium"]) {
+ _audioQuality =[NSNumber numberWithInt:AVAudioQualityMedium];
+ } else if ([quality isEqual: @"High"]) {
+ _audioQuality =[NSNumber numberWithInt:AVAudioQualityHigh];
+ }
+ }
+
+ // Set channels from options
+ if (channels != nil) {
+ _audioChannels = channels;
+ }
+
+ // Set audio encoding from options
+ if (encoding != nil) {
+ if ([encoding isEqual: @"lpcm"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatLinearPCM];
+ } else if ([encoding isEqual: @"ima4"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleIMA4];
+ } else if ([encoding isEqual: @"aac"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEG4AAC];
+ } else if ([encoding isEqual: @"MAC3"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE3];
+ } else if ([encoding isEqual: @"MAC6"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatMACE6];
+ } else if ([encoding isEqual: @"ulaw"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatULaw];
+ } else if ([encoding isEqual: @"alaw"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatALaw];
+ } else if ([encoding isEqual: @"mp1"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer1];
+ } else if ([encoding isEqual: @"mp2"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatMPEGLayer2];
+ } else if ([encoding isEqual: @"alac"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatAppleLossless];
+ } else if ([encoding isEqual: @"amr"]) {
+ _audioEncoding =[NSNumber numberWithInt:kAudioFormatAMR];
+ }
+ }
+
+ // Set sample rate from options
+ _audioSampleRate = [NSNumber numberWithFloat:sampleRate];
+
+ // Set buffer size from options
+ _bufferSize = bufferSize;
+
+ NSDictionary *recordSettings = [NSDictionary dictionaryWithObjectsAndKeys:
+ //_audioQuality, AVEncoderAudioQualityKey,
+ //_audioEncoding, AVFormatIDKey,
+ _audioChannels, AVNumberOfChannelsKey,
+ _audioSampleRate, AVSampleRateKey,
+ nil];
+
+ // Enable metering from options
+ if (meteringEnabled != NO) {
+ _meteringEnabled = meteringEnabled;
+ }
+
+ _vadSensitivity = vadSensitivity;
+ _vadTimeout = vadTimeout;
+
+ if (vad == nil) {
+ vad = [[WITVad alloc] initWithAudioSampleRate:[_audioSampleRate intValue]
+ vadSensitivity:_vadSensitivity
+ vadTimeout:_vadTimeout];
+ vad.delegate = self;
+ }
+
+ streamingModule = [[StreamingModule alloc] init];
+ [streamingModule prepare:_audioFileURL
+ bufferSize:_bufferSize
+ settings:recordSettings
+ handler:^(AVAudioPCMBuffer *buf){
+ NSMutableArray *body = [[NSMutableArray alloc] init];
+                   short *int16ChannelData = buf.int16ChannelData[0];
+                   for(int i=0; i < buf.frameLength; i++){
+                     [body addObject:[NSNumber numberWithShort:int16ChannelData[i]]];
+                   }
+                   if (vad != nil) {
+                     NSUInteger length = buf.frameLength * buf.format.streamDescription->mBytesPerFrame;
+                     NSData *audio = [[NSData alloc] initWithBytes:int16ChannelData length:length];
+                     [vad gotAudioSamples:audio];
+                   }
+ [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventDataReceived body:body];
+ }
+ ];
+}
+
+RCT_EXPORT_METHOD(startStreaming)
+{
+ NSLog(@"startStreaming");
+ NSLog(@"%@", _audioFileURL);
+ [self startProgressTimer];
+ [[AVAudioSession sharedInstance] setActive:YES error:nil];
+ [streamingModule start];
+}
+
+RCT_EXPORT_METHOD(stopStreaming)
+{
+ NSLog(@"stopStreaming");
+ [streamingModule stop];
+ [[AVAudioSession sharedInstance] setActive:NO error:nil];
+ _prevProgressUpdateTime = nil;
+ if (vad) {
+ vad.delegate = nil;
+ vad = nil;
+ }
+ [self finishRecording: true];
+}
+
+RCT_EXPORT_METHOD(pauseStreaming)
+{
+ NSLog(@"pauseStreaming");
+ [self stopProgressTimer];
+ [streamingModule pause];
+}
+
+
- (NSString *)getPathForDirectory:(int)directory
{
NSArray *paths = NSSearchPathForDirectoriesInDomains(directory, NSUserDomainMask, YES);
@@ -253,4 +286,14 @@ - (NSDictionary *)constantsToExport
};
}
+-(void)vadStartedTalking {
+ NSLog(@"Started Talking");
+ [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventVadReceived body:[NSNumber numberWithInt:1]];
+}
+
+-(void)vadStoppedTalking {
+ NSLog(@"Stopped Talking");
+ [self.bridge.eventDispatcher sendAppEventWithName:AudioRecorderEventVadReceived body:[NSNumber numberWithInt:0]];
+}
+
@end
diff --git a/ios/RNAudio.xcodeproj/project.pbxproj b/ios/RNAudio.xcodeproj/project.pbxproj
index 5e1326f0..d5178132 100644
--- a/ios/RNAudio.xcodeproj/project.pbxproj
+++ b/ios/RNAudio.xcodeproj/project.pbxproj
@@ -7,7 +7,11 @@
objects = {
/* Begin PBXBuildFile section */
+ 38D7625B1EDD3F58007B8DE3 /* StreamingModule.m in Sources */ = {isa = PBXBuildFile; fileRef = 38D762591EDD3F58007B8DE3 /* StreamingModule.m */; };
429D457A1CFC96E100CBD51A /* AudioRecorderManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 429D45761CFC96E100CBD51A /* AudioRecorderManager.m */; };
+ 7664CAD31F39482200FC59DE /* WITCvad.m in Sources */ = {isa = PBXBuildFile; fileRef = 7664CAD21F39482200FC59DE /* WITCvad.m */; };
+ 7664CAD61F394C8100FC59DE /* WITVad.m in Sources */ = {isa = PBXBuildFile; fileRef = 7664CAD51F394C8100FC59DE /* WITVad.m */; };
+ 76A04C0C1EDD91B800516515 /* AVFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 76A04C0B1EDD91B800516515 /* AVFoundation.framework */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@@ -23,9 +27,16 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
+ 38D762591EDD3F58007B8DE3 /* StreamingModule.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = StreamingModule.m; sourceTree = SOURCE_ROOT; };
+ 38D7625A1EDD3F58007B8DE3 /* StreamingModule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = StreamingModule.h; sourceTree = SOURCE_ROOT; };
429D45761CFC96E100CBD51A /* AudioRecorderManager.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = AudioRecorderManager.m; sourceTree = SOURCE_ROOT; };
429D45771CFC96E100CBD51A /* AudioRecorderManager.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AudioRecorderManager.h; sourceTree = SOURCE_ROOT; };
42F559BA1CFC90C400DC3F84 /* libRNAudio.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libRNAudio.a; sourceTree = BUILT_PRODUCTS_DIR; };
+ 7664CAD11F39482200FC59DE /* WITCvad.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WITCvad.h; sourceTree = SOURCE_ROOT; };
+ 7664CAD21F39482200FC59DE /* WITCvad.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = WITCvad.m; sourceTree = SOURCE_ROOT; };
+ 7664CAD41F394C8100FC59DE /* WITVad.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WITVad.h; sourceTree = SOURCE_ROOT; };
+ 7664CAD51F394C8100FC59DE /* WITVad.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = WITVad.m; sourceTree = SOURCE_ROOT; };
+ 76A04C0B1EDD91B800516515 /* AVFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = AVFoundation.framework; path = System/Library/Frameworks/AVFoundation.framework; sourceTree = SDKROOT; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@@ -33,6 +44,7 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
+ 76A04C0C1EDD91B800516515 /* AVFoundation.framework in Frameworks */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@@ -44,6 +56,7 @@
children = (
42F559BC1CFC90C400DC3F84 /* RNAudio */,
42F559BB1CFC90C400DC3F84 /* Products */,
+ 76A04C0A1EDD91B700516515 /* Frameworks */,
);
sourceTree = "";
};
@@ -58,12 +71,26 @@
42F559BC1CFC90C400DC3F84 /* RNAudio */ = {
isa = PBXGroup;
children = (
+ 7664CAD41F394C8100FC59DE /* WITVad.h */,
+ 7664CAD51F394C8100FC59DE /* WITVad.m */,
+ 7664CAD11F39482200FC59DE /* WITCvad.h */,
+ 7664CAD21F39482200FC59DE /* WITCvad.m */,
429D45761CFC96E100CBD51A /* AudioRecorderManager.m */,
429D45771CFC96E100CBD51A /* AudioRecorderManager.h */,
+ 38D762591EDD3F58007B8DE3 /* StreamingModule.m */,
+ 38D7625A1EDD3F58007B8DE3 /* StreamingModule.h */,
);
path = RNAudio;
sourceTree = "";
};
+ 76A04C0A1EDD91B700516515 /* Frameworks */ = {
+ isa = PBXGroup;
+ children = (
+ 76A04C0B1EDD91B800516515 /* AVFoundation.framework */,
+ );
+ name = Frameworks;
+ sourceTree = "";
+ };
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
@@ -120,6 +147,9 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
+ 38D7625B1EDD3F58007B8DE3 /* StreamingModule.m in Sources */,
+ 7664CAD31F39482200FC59DE /* WITCvad.m in Sources */,
+ 7664CAD61F394C8100FC59DE /* WITVad.m in Sources */,
429D457A1CFC96E100CBD51A /* AudioRecorderManager.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
diff --git a/ios/StreamingModule.h b/ios/StreamingModule.h
new file mode 100644
index 00000000..c3e38948
--- /dev/null
+++ b/ios/StreamingModule.h
@@ -0,0 +1,31 @@
+//
+// StreamingModule.h
+// RNAudio
+//
+// Created by JeungminOh on 30/05/2017.
+// Copyright © 2017 Joshua Sierles. All rights reserved.
+//
+
+#import <AVFoundation/AVFoundation.h>
+
+@interface StreamingModule : NSObject
+{
+ AVAudioEngine *_engine;
+ void (^_audioDataReceived)(AVAudioPCMBuffer *buf);
+ NSURL *_fileUrl;
+ NSDictionary *_settings;
+ AVAudioMixerNode *_downMixer;
+ NSTimeInterval _startTime;
+ int _bufferSize;
+
+ @public
+ bool recording;
+ NSTimeInterval currentTime;
+}
+
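+// prepare: installs a tap on an AVAudioEngine mixer, writes converted buffers to recordingFileUrl
+// and invokes handler once per captured buffer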
+- (void)prepare:(NSURL*)recordingFileUrl bufferSize:(int)bufferSize settings:(NSDictionary*)settings handler:(void(^)(AVAudioPCMBuffer *))handler;
+- (void)start;
+- (void)pause;
+- (void)stop;
+
+@end
diff --git a/ios/StreamingModule.m b/ios/StreamingModule.m
new file mode 100644
index 00000000..9a524175
--- /dev/null
+++ b/ios/StreamingModule.m
@@ -0,0 +1,125 @@
+//
+// StreamingModule.c
+// RNAudio
+//
+// Created by JeungminOh on 30/05/2017.
+// Copyright © 2017 Joshua Sierles. All rights reserved.
+//
+
+#import "StreamingModule.h"
+
+@implementation StreamingModule
+
+- (void)prepare:(NSURL *)recordingFileUrl bufferSize:(int)bufferSize settings:(NSDictionary*)settings handler:(void(^)(AVAudioPCMBuffer *))handler {
+ _audioDataReceived = [handler copy];
+ _fileUrl = recordingFileUrl;
+ _settings = settings;
+ _bufferSize = bufferSize;
+
+ _engine = [[AVAudioEngine alloc] init];
+
+ AVAudioInputNode *input = [_engine inputNode];
+ _downMixer = [[AVAudioMixerNode alloc] init];
+ AVAudioMixerNode *mainMixer = [_engine mainMixerNode];
+
+ NSLog(@"Prepare");
+ NSLog(@"%@", [settings description]);
+
+
+ AVAudioFormat *pcmFloat32Format =
+ [[AVAudioFormat alloc] initWithCommonFormat: AVAudioPCMFormatFloat32
+ sampleRate: [_settings[AVSampleRateKey] doubleValue]
+ channels: [_settings[AVNumberOfChannelsKey] intValue]
+ interleaved: NO
+ ];
+
+ AVAudioFormat *pcmInt16Format =
+ [[AVAudioFormat alloc] initWithCommonFormat: AVAudioPCMFormatInt16
+ sampleRate: [_settings[AVSampleRateKey] doubleValue]
+ channels: [_settings[AVNumberOfChannelsKey] intValue]
+ interleaved: NO
+ ];
+
+ NSLog(@"%@", [pcmFloat32Format description]);
+
+ [_engine attachNode:_downMixer];
+ [_engine connect:input to:_downMixer format:[input inputFormatForBus:0]];
+ [_downMixer setVolume:0];
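+    // volume 0 keeps the mic signal out of the speakers while the tap below still receives it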
+ [_engine connect:_downMixer to:mainMixer format:pcmFloat32Format];
+
+ NSError *error = nil;
+ AVAudioFile *file = [[AVAudioFile alloc] initForWriting:_fileUrl
+ settings:_settings
+ commonFormat:AVAudioPCMFormatInt16
+ interleaved:NO
+ error:&error];
+
+ NSLog(@"InstallTapOnBus");
+
+ [_downMixer installTapOnBus: 0 bufferSize: _bufferSize format: pcmFloat32Format block: ^(AVAudioPCMBuffer *buf, AVAudioTime *when) {
+        // 'buf' contains audio captured from input node at time 'when'
+ currentTime = when.sampleTime / when.sampleRate - _startTime;
+
+ // convert AVAudioPCMFormatFloat32 to AVAudioPCMFormatInt16
+ AVAudioPCMBuffer *pcmInt16Buffer = [[AVAudioPCMBuffer alloc] initWithPCMFormat:pcmInt16Format
+ frameCapacity:[buf frameCapacity]];
+
+ [pcmInt16Buffer setFrameLength: [buf frameLength]];
+
+        for (int channel=0; channel < [buf format].channelCount; channel++) {
+            float *floatChannelData = buf.floatChannelData[channel];
+            short *int16ChannelData = pcmInt16Buffer.int16ChannelData[channel];
+            for (int i = 0; i < [buf frameLength]; i++) {
+                int16ChannelData[i] = (short)(floatChannelData[i] * 32767.0f);
+            }
+        }
+
+        // write the converted buffer to the output file and hand it to the caller
+        NSError *writeError = nil;
+        [file writeFromBuffer:pcmInt16Buffer error:&writeError];
+        _audioDataReceived(pcmInt16Buffer);
+    }];
+}
+
+- (void)start {
+    recording = true;
+    NSError *error = nil;
+    [_engine startAndReturnError:&error];
+    if (error) {
+        NSLog(@"error: %@", [error localizedDescription]);
+    }
+}
+
+- (void)pause {
+    recording = false;
+    [_engine pause];
+}
+
+- (void)stop {
+    recording = false;
+    [_engine stop];
+}
+
+@end
diff --git a/ios/WITCvad.h b/ios/WITCvad.h
new file mode 100644
--- /dev/null
+++ b/ios/WITCvad.h
@@ -0,0 +1,160 @@
+#ifndef WITCVAD_H
+#define WITCVAD_H
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdio.h>
+
+
+/*
+ * This speech algorithm looks at multiple auditory components related to speech:
+ * - Energy divided into 1 KHz bands
+ * - Dominant Frequency Component
+ * - Spectral Flatness Measure
+ * - Zero-crossings
+ *
+ * If many features of speech are present for a period of time (~150 ms), speech is detected.
+ * The end of speech is determined by most features of speech disappearing for an extended period of time (~1 sec)
+ */
+
+#define DETECTOR_CVAD_FRAMES_INIT 40 /* number of frames to use to initialize values */
+#define DETECTOR_CVAD_E_TH_COEFF_LOW_BAND 2.5f /* Energy threshold coefficient */
+#define DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS 2.0f /* Energy threshold coefficient */
+#define DETECTOR_CVAD_SFM_TH 3.0f /* Spectral Flatness Measure threshold */
+#define DETECTOR_CVAD_DFC_TH 250.0f /* most Dominant Frequency Component threshold */
+#define DETECTOR_CVAD_MIN_ZERO_CROSSINGS 5 /* fewest zero crossings for speech */
+#define DETECTOR_CVAD_MAX_ZERO_CROSSINGS 15 /* maximum zero crossings for speech */
+#define DETECTOR_CVAD_RESULT_MEMORY 130 /* number of frame results to keep in memory */
+#define DETECTOR_CVAD_ENERGY_MEMORY 20 /* number of frame results to keep in memory */
+#define DETECTOR_CVAD_N_ENERGY_BANDS 5 /* number of 1 KHz energy bands to compute */
+#define DETECTOR_CVAD_MINIMUM_LENGTH 1000 /* minimum length of vad in ms */
+
+//final speech detection variables
+#define DETECTOR_CVAD_N_FRAMES_CHECK_START 15
+#define DETECTOR_CVAD_COUNT_SUM_START 4.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE 3.8*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT 1.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR 0.6
+#define DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE 0.3
+#define DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG 6.5*DETECTOR_CVAD_N_FRAMES_CHECK_START
+#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR 1.8
+#define DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE 1.5
+
+typedef struct {
+ double energy_thresh_coeff_lower;
+ double energy_thresh_coeff_upper;
+ double sfm_thresh;
+ double dfc_thresh;
+ double th_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double th_sfm;
+ double th_dfc;
+ double ref_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double ref_sfm;
+ double ref_dfc;
+ double ref_dfc_var;
+ double energy_update_coeff[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double energy_prev_variance[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double energy_history[DETECTOR_CVAD_N_ENERGY_BANDS][DETECTOR_CVAD_ENERGY_MEMORY];
+ double sfm_update_coeff;
+ double dfc_history[DETECTOR_CVAD_FRAMES_INIT];
+ double dfc_update_coeff;
+ float end_sum_long_coeff;
+ float end_sum_short_coeff;
+ int frame_number;
+ int speech_start_frame;
+ int max_speech_time;
+ int energy_history_index;
+ int min_zero_crossings;
+ int max_zero_crossings;
+ int thresh_initialized;
+ int silence_count;
+ int talking;
+ int sample_freq;
+ int samples_per_frame;
+ int max_start_sum;
+ int n_frames_check_start;
+ int n_frames_check_end_short;
+ int n_frames_check_end_long;
+ int start_sum_threshold;
+ int previous_state_index;
+ short int previous_state[DETECTOR_CVAD_RESULT_MEMORY];
+} s_wv_detector_cvad_state;
+
+/*
+ Main entry point to the detection algorithm.
+ This returns a -1 if there is no change in state, a 1 if someone started talking, and a 0 if speech ended
+ */
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags);
+
+
+/*
+ Initialize the cvad_state structure, which represents the state of
+ one instance of the algorithm
+
+ sensitivity: 0-100; lower for a close-up mic, higher for a fixed, distant mic
+ speech_timeout: maximum speech length in ms before talking is force-ended (0 disables)
+ */
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout);
+
+/*
+ Safely frees memory for a cvad_state
+ */
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Set VAD sensitivity (0-100)
+ - Lower values are for strong voice signals, such as a cellphone or personal mic
+ - Higher values are for use with a fixed-position mic or any application with voice buried in ambient noise
+ - Defaults to 0
+ */
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity);
+
+/*
+ Set the reference values of the energy, most dominant frequency component and the spectral flatness measure.
+ The threshold value is then set based on the "background" reference levels
+ */
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm);
+
+/*
+ Set the threshold on the cvad_state.
+ */
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Computes the variance of the energy over the past few windows and adapts the update coeffs accordingly
+ */
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state);
+
+/*
+ Compare the distance between the value and the minimum value of each component and return how many
+ component(s) responded positively.
+ Each frame with more than 2 (out of 3) matching features is qualified as a speech frame.
+ example : energy - cvad_state->min_energy > cvad_state->th_energy
+ */
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings);
+
+/*
+ Return the frequency with the biggest amplitude (from a frame).
+ */
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples);
+
+/*
+ Computes the energy of the first DETECTOR_CVAD_N_ENERGY_BANDS 1 KHz bands
+ */
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples);
+
+/*
+ Compute the spectral flatness of a frame.
+ It tells us if all the frequencies have a similar amplitude, which would mean noise,
+ or if there are some dominant frequencies, which could mean voice.
+ */
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb);
+
+/*
+ Counts the number of times the signal crosses zero
+ Even soft vocalizations have a fairly regular number of zero crossings (~5-15 for 10ms)
+ */
+int frames_detector_cvad_zero_crossings(short int *samples, int nb);
+
+#endif
diff --git a/ios/WITCvad.m b/ios/WITCvad.m
new file mode 100644
index 00000000..91eabf30
--- /dev/null
+++ b/ios/WITCvad.m
@@ -0,0 +1,357 @@
+//
+// WITCvad.m
+// Wit
+//
+// Created by Anthony Kesich on 11/12/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#include "WITCvad.h"
+
+
+/*
+ Adds value to the head of memory
+ */
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value);
+
+/*
+ Sums up the last N values of memory
+ */
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb);
+
+
+int wvs_cvad_detect_talking(s_wv_detector_cvad_state *cvad_state, short int *samples, float *fft_mags)
+{
+ double dfc;
+ double band_energy[DETECTOR_CVAD_N_ENERGY_BANDS];
+ double sfm;
+ int fft_size = pow(2,floor(log2(cvad_state->samples_per_frame)));
+ short int counter;
+ int action = -1;
+ int zero_crossings;
+
+ //only process cvad_state->samples_per_frame samples at a time
+ //frames_detector_cvad_fft(samples, fft_modules, cvad_state->samples_per_frame);
+ dfc = frames_detector_cvad_most_dominant_freq(cvad_state, fft_mags, fft_size, cvad_state->samples_per_frame);
+ sfm = frames_detector_cvad_spectral_flatness(fft_mags, fft_size);
+ zero_crossings = frames_detector_cvad_zero_crossings(samples, cvad_state->samples_per_frame);
+ frames_detector_cvad_multiband_energy(cvad_state, fft_mags, fft_size, band_energy, cvad_state->samples_per_frame);
+
+ vw_detector_cvad_set_threshold(cvad_state);
+ counter = vw_detector_cvad_check_frame(cvad_state, band_energy, dfc, sfm, zero_crossings);
+ frame_memory_push(cvad_state, counter);
+
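+    // frames scoring at least 3 feature points count as speech; anything quieter refreshes the noise reference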
+ if ((counter < 3 && cvad_state->talking == 0) || !cvad_state->thresh_initialized) {
+ cvad_state->silence_count++;
+ //only update reference levels if we don't detect speech
+ wv_detector_cvad_update_ref_levels(cvad_state, band_energy, dfc, sfm);
+ }
+ if (cvad_state->thresh_initialized) {
+ int start_sum = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_START);
+ int stop_sum_long = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG);
+ int stop_sum_short = frame_memory_sum_last_n(cvad_state, DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT);
+ int speech_time = (cvad_state->frame_number-cvad_state->speech_start_frame) * cvad_state->samples_per_frame * 1000 / cvad_state->sample_freq;
+
+ if(start_sum > cvad_state->max_start_sum){
+ cvad_state->max_start_sum = start_sum;
+ }
+ if (!cvad_state->talking && start_sum >= cvad_state->start_sum_threshold ) {
+ cvad_state->talking = 1;
+ cvad_state->speech_start_frame = cvad_state->frame_number;
+ action = 1;
+ }
+ else if (cvad_state->talking && speech_time > DETECTOR_CVAD_MINIMUM_LENGTH
+ && ((counter < 3
+ && stop_sum_long <= cvad_state->max_start_sum*cvad_state->end_sum_long_coeff
+ && stop_sum_short <= cvad_state->max_start_sum*cvad_state->end_sum_short_coeff)
+ || (cvad_state->max_speech_time > 0
+ && speech_time >= cvad_state->max_speech_time))) {
+ cvad_state->talking = 0;
+ action = 0;
+ cvad_state->max_start_sum = 0;
+ }
+ }
+
+ cvad_state->frame_number++;
+
+ return action;
+}
+
+s_wv_detector_cvad_state* wv_detector_cvad_init(int sample_rate, int sensitivity, int speech_timeout)
+{
+ s_wv_detector_cvad_state *cvad_state = malloc(sizeof(s_wv_detector_cvad_state));
+ cvad_state->energy_thresh_coeff_lower = DETECTOR_CVAD_E_TH_COEFF_LOW_BAND;
+ cvad_state->energy_thresh_coeff_upper = DETECTOR_CVAD_E_TH_COEFF_UPPER_BANDS;
+ cvad_state->sfm_thresh= DETECTOR_CVAD_SFM_TH;
+ cvad_state->dfc_thresh= DETECTOR_CVAD_DFC_TH;
+ cvad_state->min_zero_crossings= DETECTOR_CVAD_MIN_ZERO_CROSSINGS;
+ cvad_state->max_zero_crossings= DETECTOR_CVAD_MAX_ZERO_CROSSINGS;
+ memset(cvad_state->energy_update_coeff, 0.20, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ memset(cvad_state->energy_prev_variance, -1, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ memset(cvad_state->energy_history, 0, DETECTOR_CVAD_ENERGY_MEMORY * DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
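+    // note: memset fills bytes, so the 0.20 and -1 fills land as 0.0 and NaN doubles;
+    // wv_detector_cvad_modify_update_coeffs recomputes these before they are used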
+ cvad_state->energy_history_index = 0;
+ cvad_state->dfc_update_coeff = 0.10;
+ cvad_state->sfm_update_coeff = 0.10;
+ cvad_state->frame_number = 0;
+ cvad_state->speech_start_frame = -1;
+ cvad_state->max_speech_time = speech_timeout;
+ cvad_state->thresh_initialized = 0;
+ cvad_state->silence_count = 0;
+ cvad_state->talking = 0;
+ memset(cvad_state->ref_energy, 0, DETECTOR_CVAD_N_ENERGY_BANDS * sizeof(double));
+ cvad_state->ref_dfc = 0;
+ cvad_state->ref_sfm = 0;
+ memset(cvad_state->dfc_history, 0, DETECTOR_CVAD_FRAMES_INIT * sizeof(double));
+ cvad_state->sample_freq = sample_rate;
+ cvad_state->max_start_sum = 0;
+ cvad_state->samples_per_frame = pow(2,ceil(log2(cvad_state->sample_freq/150))); //around 100 frames per second, but must be a power of two
+ cvad_state->previous_state_index = 0;
+ memset(cvad_state->previous_state, 0, DETECTOR_CVAD_RESULT_MEMORY * sizeof(short int));
+
+ wv_detector_cvad_set_sensitivity(cvad_state, sensitivity);
+
+ return cvad_state;
+}
+
+void wv_detector_cvad_clean(s_wv_detector_cvad_state *cvad_state)
+{
+ free(cvad_state);
+}
+
+void wv_detector_cvad_set_sensitivity(s_wv_detector_cvad_state *cvad_state, int sensitivity)
+{
+ float sensitivity_frac = fmax(0,fmin(100,sensitivity))/100.0;
+ cvad_state->n_frames_check_start=DETECTOR_CVAD_N_FRAMES_CHECK_START;
+ cvad_state->n_frames_check_end_short=DETECTOR_CVAD_N_FRAMES_CHECK_END_SHORT;
+ cvad_state->n_frames_check_end_long=DETECTOR_CVAD_N_FRAMES_CHECK_END_LONG;
+
+ cvad_state->start_sum_threshold = DETECTOR_CVAD_COUNT_SUM_START_SENSITIVE*sensitivity_frac;
+ cvad_state->start_sum_threshold += DETECTOR_CVAD_COUNT_SUM_START*(1-sensitivity_frac);
+
+ cvad_state->end_sum_short_coeff = DETECTOR_CVAD_COUNT_END_SHORT_FACTOR_SENSITIVE*sensitivity_frac;
+ cvad_state->end_sum_short_coeff += DETECTOR_CVAD_COUNT_END_SHORT_FACTOR*(1-sensitivity_frac);
+
+ cvad_state->end_sum_long_coeff = DETECTOR_CVAD_COUNT_END_LONG_FACTOR_SENSITIVE*sensitivity_frac;
+ cvad_state->end_sum_long_coeff += DETECTOR_CVAD_COUNT_END_LONG_FACTOR*(1-sensitivity_frac);
+}
+
+void wv_detector_cvad_update_ref_levels(s_wv_detector_cvad_state *cvad_state,
+ double *band_energy,
+ double dfc,
+ double sfm)
+{
+ int b=0;
+ if (!cvad_state->thresh_initialized) {
+ //if still initializing, accumulate values to average
+        for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+            cvad_state->ref_energy[b] += band_energy[b];
+ }
+
+
+ cvad_state->ref_sfm += sfm;
+
+ cvad_state->dfc_history[cvad_state->frame_number] = dfc > 0 ? log(dfc) : 0;
+ }
+
+ //record energy history
+    for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        cvad_state->energy_history[b][cvad_state->energy_history_index] = band_energy[b];
+ }
+ cvad_state->energy_history_index++;
+ cvad_state->energy_history_index%=DETECTOR_CVAD_ENERGY_MEMORY;
+
+ if (cvad_state->frame_number >= DETECTOR_CVAD_FRAMES_INIT) {
+ if(!cvad_state->thresh_initialized) {
+ //if done initializing, divide by number of samples to get an average
+ cvad_state->thresh_initialized = 1;
+            for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+                cvad_state->ref_energy[b] /= cvad_state->frame_number;
+ }
+
+ cvad_state->ref_sfm /= cvad_state->frame_number;
+
+ double sum = 0;
+ double sq_sum = 0;
+            for(b=0; b<DETECTOR_CVAD_FRAMES_INIT; b++){
+                cvad_state->ref_dfc+=cvad_state->dfc_history[b];
+ sum += cvad_state->dfc_history[b];
+ sq_sum += pow(cvad_state->dfc_history[b],2);
+ }
+ cvad_state->ref_dfc /= cvad_state->frame_number;
+ cvad_state->ref_dfc_var = (sq_sum-sum*sum/cvad_state->frame_number)/(cvad_state->frame_number -1);
+
+ } else if (cvad_state->talking == 0) {
+ //otherwise update thresholds based on adaptive rules if there's no speech
+ wv_detector_cvad_modify_update_coeffs(cvad_state);
+            for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+                cvad_state->ref_energy[b] *= (1-cvad_state->energy_update_coeff[b]);
+ cvad_state->ref_energy[b] += cvad_state->energy_update_coeff[b]*band_energy[b];
+ }
+
+ }
+ }
+
+}
+
+void vw_detector_cvad_set_threshold(s_wv_detector_cvad_state *cvad_state)
+{
+ //update thresholds to be a multiple of the reference level
+ int b;
+ cvad_state->th_energy[0] = cvad_state->ref_energy[0]*cvad_state->energy_thresh_coeff_lower;
+    for(b=1; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        cvad_state->th_energy[b] = cvad_state->ref_energy[b]*cvad_state->energy_thresh_coeff_upper;
+ }
+ cvad_state->th_dfc = cvad_state->ref_dfc+cvad_state->dfc_thresh;
+ cvad_state->th_sfm = cvad_state->ref_sfm+cvad_state->sfm_thresh;
+}
+
+void wv_detector_cvad_modify_update_coeffs(s_wv_detector_cvad_state *cvad_state){
+ int b;
+    for(b=0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        double sum = 0;
+        double sq_sum = 0;
+        int h;
+        for(h=0; h<DETECTOR_CVAD_ENERGY_MEMORY; h++){
+        sum+=cvad_state->energy_history[b][h];
+ sq_sum+=pow(cvad_state->energy_history[b][h],2);
+ }
+ double variance = (sq_sum-sum*sum/DETECTOR_CVAD_ENERGY_MEMORY)/(DETECTOR_CVAD_ENERGY_MEMORY-1);
+ double ratio = variance/cvad_state->energy_prev_variance[b];
+ if(ratio > 1.25){
+ cvad_state->energy_update_coeff[b] = 0.25;
+ } else if(ratio > 1.10){
+ cvad_state->energy_update_coeff[b] = 0.20;
+ } else if(ratio > 1.00){
+ cvad_state->energy_update_coeff[b] = 0.15;
+ } else if(ratio > 0.00){
+ cvad_state->energy_update_coeff[b] = 0.10;
+ } else {
+ //negative value indicates that this is the first pass of variance. Just set the coeff to 0.2
+ cvad_state->energy_update_coeff[b] = 0.20;
+ }
+ cvad_state->energy_prev_variance[b] = variance;
+ }
+}
+
+short int vw_detector_cvad_check_frame(s_wv_detector_cvad_state *cvad_state, double *band_energy, double dfc, double sfm, int zero_crossings)
+{
+ short int counter;
+
+ counter = 0;
+
+ int band_counter = 0;
+ if (band_energy[0] > cvad_state->th_energy[0]) {
+ counter += 2;
+ }
+
+ int b;
+    for(b=1; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        if(band_energy[b] > cvad_state->th_energy[b]){
+ band_counter++;
+ }
+ }
+ if(band_counter >= 2){
+ counter+=2;
+ }
+
+ if (fabs((dfc > 0 ? log(dfc): 0) - cvad_state->ref_dfc) > cvad_state->ref_dfc_var) {
+ counter++;
+ }
+ if (sfm > cvad_state->th_sfm) {
+ counter++;
+ }
+ if(zero_crossings >= cvad_state->min_zero_crossings && zero_crossings <= cvad_state->max_zero_crossings){
+ counter++;
+ }
+
+ return counter;
+}
+
+
+double frames_detector_cvad_most_dominant_freq(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double nb_samples)
+{
+ double k = 0.0f;
+ double max = 0.0f;
+ double amplitude_minimum = 1.0f;
+ int i;
+
+ for (i = 0; i < nb_modules; i++) {
+ if (fft_mags[i] > max && fft_mags[i] > amplitude_minimum) {
+ max = fft_mags[i];
+ k = i;
+ }
+ }
+
+ return k * (double)cvad_state->sample_freq / (double)nb_samples;
+}
+
+void frames_detector_cvad_multiband_energy(s_wv_detector_cvad_state *cvad_state, float *fft_mags, int nb_modules, double *band_energy, int nb_samples){
+
+ int b = 0;
+ int k = 0;
+
+    for(b = 0; b<DETECTOR_CVAD_N_ENERGY_BANDS; b++){
+        band_energy[b] = 0;
+        while(k*cvad_state->sample_freq/nb_samples < 1000*(b+1)){
+ band_energy[b]+=fft_mags[k];
+ k++;
+ }
+ }
+
+}
+
+double frames_detector_cvad_spectral_flatness(float *fft_mags, int nb)
+{
+ double geo_mean = 0.0f;
+ double arithm_mean = 0.0f;
+ double sfm = 0.0f;
+ int i;
+
+ for (i = 0; i < nb; i++) {
+ if (fft_mags[i] != 0.0f) {
+ geo_mean += log(fft_mags[i]);
+ arithm_mean += fft_mags[i];
+ }
+ }
+ geo_mean = exp(geo_mean / (double) nb);
+ arithm_mean = arithm_mean / (double) nb;
+ sfm = 10 * log10(geo_mean / arithm_mean);
+ sfm = fabs(sfm);
+
+ return sfm;
+}
+
+int frames_detector_cvad_zero_crossings(short int *samples, int nb){
+ int num_zero_crossings = 0;
+ int i;
+
+    for(i=1; i<nb; i++){
+        if((samples[i-1] >= 0 && samples[i] < 0) || (samples[i-1] < 0 && samples[i] >= 0)){
+            num_zero_crossings++;
+        }
+    }
+
+    return num_zero_crossings;
+}
+
+static void frame_memory_push(s_wv_detector_cvad_state *cvad_state, short int value)
+{
+    cvad_state->previous_state[cvad_state->previous_state_index] = value;
+ cvad_state->previous_state_index++;
+ cvad_state->previous_state_index%=DETECTOR_CVAD_RESULT_MEMORY;
+}
+
+static int frame_memory_sum_last_n(s_wv_detector_cvad_state *cvad_state, int nb)
+{
+ int i = 0;
+ int sum = 0;
+
+ for (i = 0; i < nb; i++) {
+ int indx = (cvad_state->previous_state_index - (i+1) + DETECTOR_CVAD_RESULT_MEMORY) % DETECTOR_CVAD_RESULT_MEMORY;
+ sum += cvad_state->previous_state[indx];
+ }
+
+ return sum;
+}
+
diff --git a/ios/WITVad.h b/ios/WITVad.h
new file mode 100644
index 00000000..c0c90610
--- /dev/null
+++ b/ios/WITVad.h
@@ -0,0 +1,32 @@
+//
+// WITVad.h
+// Wit
+//
+// Created by Aric Lasry on 8/6/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#import <Foundation/Foundation.h>
+#import <Accelerate/Accelerate.h>
+#import "WITCvad.h"
+
+@protocol WITVadDelegate;
+
+@interface WITVad : NSObject
+
+@property (nonatomic, weak) id<WITVadDelegate> delegate;
+
+@property (nonatomic, assign) BOOL stoppedUsingVad;
+
+- (instancetype)initWithAudioSampleRate:(int)audioSampleRate vadSensitivity:(int)_vadSensitivity vadTimeout:(int)_vadTimeout;
+- (void)gotAudioSamples:(NSData *)samples;
+
+@end
+
+
+@protocol WITVadDelegate <NSObject>
+
+-(void) vadStartedTalking;
+-(void) vadStoppedTalking;
+
+@end
diff --git a/ios/WITVad.m b/ios/WITVad.m
new file mode 100644
index 00000000..d587bea2
--- /dev/null
+++ b/ios/WITVad.m
@@ -0,0 +1,123 @@
+//
+// WITVad.m
+// Wit
+//
+// Created by Aric Lasry on 8/6/14.
+// Copyright (c) 2014 Willy Blandin. All rights reserved.
+//
+
+#import "WITVad.h"
+
+@implementation WITVad {
+ s_wv_detector_cvad_state *vad_state;
+ FFTSetup fft_setup;
+}
+
+- (void)gotAudioSamples:(NSData *)samples {
+ UInt32 size = (UInt32)[samples length];
+ short *bytes = (short*)[samples bytes];
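+    // [samples length] is in bytes; size/2 below is the count of 16-bit samples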
+
+ for(int sample_offset=0; sample_offset+self->vad_state->samples_per_frame < size/2; sample_offset+=self->vad_state->samples_per_frame){
+
+ int nonZero=0;
+
+ //check to make sure buffer actually has audio data
+        for(int i=0; i<self->vad_state->samples_per_frame; i++){
+ if(bytes[sample_offset+i] != 0){
+ nonZero=1;
+ break;
+ }
+ }
+
+ //skip frame if it has nothing
+ if(!nonZero) continue;
+
+ float *fft_mags = [self get_fft:(bytes+sample_offset)];
+
+ int detected_speech = wvs_cvad_detect_talking(self->vad_state, bytes+sample_offset, fft_mags);
+
+ free(fft_mags);
+
+ if ( detected_speech == 1){
+ //someone just started talking
+ NSLog(@"start talking...");
+ // debug(@"Starting......................")
+ dispatch_async(dispatch_get_main_queue(), ^{
+ [self.delegate vadStartedTalking];
+ });
+ } else if ( detected_speech == 0) {
+ //someone just stopped talking
+ NSLog(@"stop talking...");
+ // debug(@"Stopping......................");
+ self.stoppedUsingVad = YES;
+ dispatch_async(dispatch_get_main_queue(), ^{
+ [self.delegate vadStoppedTalking];
+ });
+ break;
+ }
+ }
+
+}
+
+- (instancetype)initWithAudioSampleRate:(int)audioSampleRate vadSensitivity:(int)_vadSensitivity vadTimeout:(int)_vadTimeout {
+ // debug(@"WITVad init");
+ self = [super init];
+ if (!self) {
+ return nil;
+ }
+ int vadSensitivity = (int)fmin(100,fmax(0,_vadSensitivity)); //must be between 0 and 100
+ int vadTimeout = (int)_vadTimeout;
+
+ self->vad_state = wv_detector_cvad_init(audioSampleRate,vadSensitivity,vadTimeout);
+ self.stoppedUsingVad = NO;
+
+ //get the next power of 2 that'll fit our data
+ int logN = log2(self->vad_state->samples_per_frame); //samples_per_frame will be a power of 2
+ //store the FFT setup for many later uses
+ self->fft_setup = vDSP_create_fftsetup(logN, kFFTRadix2);
+
+ return self;
+}
+
+- (void)dealloc {
+ // debug(@"Clean WITVad");
+ wv_detector_cvad_clean(self->vad_state);
+}
+
+- (float*)get_fft:(short *)samples {
+    int N = self->vad_state->samples_per_frame; //guaranteed to be a power of 2
+
+ //dynamically allocate an array for our results since we don't want to mutate the input samples
+ float *fft_mags = malloc(N/2 * sizeof(float));
+ float *fsamples = malloc(N * sizeof(float));
+
+    for(int i=0; i<N; i++){
+    if(i < self->vad_state->samples_per_frame){
+ fsamples[i] = samples[i];
+ } else {
+ fsamples[i] = 0;
+ }
+ }
+
+ DSPSplitComplex tempSplitComplex;
+ tempSplitComplex.realp = malloc(N/2 * sizeof(float));
+ tempSplitComplex.imagp = malloc(N/2 * sizeof(float));
+
+ //pack the real data into a split form for accelerate
+ vDSP_ctoz((DSPComplex*)fsamples, 2, &tempSplitComplex, 1, N/2);
+
+ //do the FFT
+ vDSP_fft_zrip(self->fft_setup, &tempSplitComplex, 1, (int)log2(N), kFFTDirection_Forward);
+
+ //get the magnitudes
+ vDSP_zvabs(&tempSplitComplex, 1, fft_mags, 1, N/2);
+
+ //clear up memory
+ free(fsamples);
+ free(tempSplitComplex.realp);
+ free(tempSplitComplex.imagp);
+
+ return fft_mags;
+}
+
+@end
diff --git a/package.json b/package.json
index e2bbbe5a..96a9d064 100644
--- a/package.json
+++ b/package.json
@@ -5,9 +5,7 @@
"main": "index.js",
"author": "Joshua Sierles (https://github.com/jsierles)",
"files": [
- "ios/AudioRecorderManager.m",
- "ios/AudioRecorderManager.h",
- "ios/RNAudio.xcodeproj",
+ "ios/*",
"README.md",
"LICENSE",
"index.js",