# Source: Sound-Sample-Classifier-AI/read_wave.py (davidliii/Sound-Sample-Classifier-AI, master branch)

import struct as st
import numpy as np
import math

def read_wave(path, normalize=True, length=1, threshold=0.001):
    '''
    Parse an uncompressed PCM wave file into a single-channel list of samples.

    Inputs: path (string)     - full filepath to audio file
            normalize (bool)  - set true to apply a normalization multiplier across audio frames
                                so that frame data ranges between [-1, 1]. A completely silent
                                (all-zero) or empty signal is left unchanged
            length (float)    - chops audio sample to meet this length (seconds). If audio sample
                                is shorter than length, it will be unchanged. Pass None to skip
            threshold (float) - frames are dropped when the mean absolute value of the (up to)
                                100 frames starting at that frame is below this value. Applied
                                after normalization. Pass None to skip

    Outputs: data (list)      - audio data values, one float per frame, averaged across the
                                channels of the frame (mono frames are returned as-is)
             sampleRate (int) - 1 / sampleRate gives the time separation between data values.
                                sampleRate / 2 gives the nyquist frequency of the discrete signal
    Return values are None, None if:
        1. File open error
        2. File does not meet the following specification:
            a. Correctly formed header information (chunk ids, formats, etc.)
            b. File is uncompressed (PCM = 1)
            c. File is mono or stereo
            d. Bits per sample is a non-zero multiple of 8
    '''

    ''' HELPER FUNCTIONS START'''
    def _findDataStart(fileContent):
        # Locate the 'data' chunk id, searching past the 36-byte RIFF/fmt header.
        # The chunk needs its 4-byte id and 4-byte size to be usable.
        start = fileContent.find(b'data', 36)
        if start == -1 or start + 8 > len(fileContent):
            return None
        return start

    def _normalizeData(data):
        # Scale so the largest magnitude becomes 1. Nothing to scale when the
        # signal is empty or entirely zero (avoids dividing by zero).
        maxVal = max((abs(x) for x in data), default=0)
        if maxVal == 0:
            return data

        multiplier = 1 / maxVal
        return [x * multiplier for x in data]

    def _convertBinaryStringToInt(bString):
        # WAVE stores 8-bit PCM as unsigned bytes centred on 128; every wider
        # sample size is signed little-endian.
        if len(bString) == 1:
            return bString[0] - 128
        return int.from_bytes(bString, byteorder="little", signed=True)

    def _extractData(dataString, bytesPerSample, numChannels):
        # dataString starts at the 'data' id: 4 bytes id, 4 bytes size, payload.
        size = st.unpack('<L', dataString[4:8])[0]
        frameSize = bytesPerSample * numChannels

        # Trust the declared size only as far as bytes are really present, and
        # only decode whole frames.
        available = min(size, len(dataString) - 8)
        numFrames = available // frameSize

        data = []
        pos = 8
        for _ in range(numFrames):
            frameTotal = 0
            for _ in range(numChannels):
                frameTotal += _convertBinaryStringToInt(dataString[pos:pos+bytesPerSample])
                pos += bytesPerSample
            data.append(frameTotal / numChannels)

        return data

    def _applyLength(signal, sampleRate, duration):
        # Keep at most floor(sampleRate * duration) frames from the start.
        numFramesDesired = math.floor(sampleRate * duration)

        if len(signal) > numFramesDesired:
            signal = signal[:numFramesDesired]

        return signal

    def _trimData(signal, threshold):
        # Drop every frame whose look-ahead window is quieter than threshold.
        # Near the end of the signal the window shrinks to the frames that
        # remain, which also covers signals shorter than one window.
        windowSize = 100
        absSignal = [abs(x) for x in signal]

        kept = []
        for i, value in enumerate(signal):
            window = absSignal[i:i+windowSize]
            avg = sum(window) / len(window)
            if avg < threshold:
                continue
            kept.append(value)

        return kept
    ''' HELPER FUNCTIONS END'''

    try:
        with open(path, mode='rb') as file:
            fileContent = file.read()
    except (OSError, TypeError, ValueError):
        # TypeError / ValueError cover unusable path arguments (e.g. None or
        # embedded NUL bytes), which open() rejects before touching the disk.
        print("Cannot open file at: " + str(path))
        return None, None

    # The fixed-offset header fields below occupy the first 36 bytes.
    headerComplete = len(fileContent) >= 36

    if headerComplete:
        chunkID       = fileContent[0:4]   # Should read b'RIFF'
        riffFormat    = fileContent[8:12]  # Should read b'WAVE'
        subChunk1ID   = fileContent[12:16] # Should be b'fmt '
        audioFormat   = st.unpack('<H', fileContent[20:22])[0] # 1 for PCM, NOTE: only handle this for now
        numChannels   = st.unpack('<H', fileContent[22:24])[0] # 2 for stereo, 1 for mono
        sampleRate    = st.unpack('<L', fileContent[24:28])[0]
        bitsPerSample = st.unpack('<H', fileContent[34:36])[0]

    if (not headerComplete or chunkID != b'RIFF' or riffFormat != b'WAVE'
            or subChunk1ID != b'fmt ' or audioFormat != 1 or numChannels not in (1, 2)):
        print("File format issues at: " + str(path))
        print("Make sure file is not compressed")
        return None, None

    dataStart = _findDataStart(fileContent)
    if dataStart is None:
        print("File data could not be retrived at: " + str(path))
        return None, None

    # A zero-width sample would never advance through the data, so reject it
    # together with sizes that are not a whole number of bytes.
    if bitsPerSample == 0 or bitsPerSample % 8 != 0:
        print("Imcompatiable bytes per sample at: " + str(path))
        return None, None
    bytesPerSample = bitsPerSample // 8

    data = _extractData(fileContent[dataStart:], bytesPerSample, numChannels)

    if normalize:
        data = _normalizeData(data)

    if threshold is not None:
        data = _trimData(data, threshold)

    if length is not None:
        data = _applyLength(data, sampleRate, length)

    return data, sampleRate