In [13]: import glob

import os
import librosa
import matplotlib.pyplot as plt
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import rnn, rnn_cell
import numpy as np
%matplotlib inline'ggplot')


frames = 41
bands = 20


# In[14]:
def sliding_window(data, window_size):
    """Yield ``(start, end)`` index pairs for half-overlapping windows over ``data``.

    Each window is ``window_size`` items long and successive windows advance by
    half a window. Windows are produced for every start position below
    ``len(data)``, so the last ones may extend past the end of ``data``; callers
    that need full windows must check the length of the resulting slice.

    Args:
        data: Any sized sequence (only ``len(data)`` is used).
        window_size: Positive window length in items.

    Yields:
        Tuples of ints ``(start, start + window_size)``.

    Raises:
        ValueError: If ``window_size`` is not positive. Raised on first
            iteration, since this is a generator.
    """
    # A non-positive size would never move `start` forward and loop forever.
    if window_size <= 0:
        raise ValueError("window_size must be positive, got %r" % (window_size,))

    step = window_size / 2
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += step

def extract_features(parent_dir, sub_dirs, bands = 20, frames = 41, file_ext="*.wav"):
    """Turn every audio file under the given folders into fixed-size MFCC windows.

    Each file is cut into half-overlapping windows of ``512 * (frames - 1)``
    samples; only full-length windows are kept. Every kept window becomes one
    ``(frames, bands)`` MFCC matrix labelled with the class id parsed from the
    file name.

    Args:
        parent_dir: Directory that contains the fold sub-directories.
        sub_dirs: Iterable of sub-directory names to scan.
        bands: Number of MFCC coefficients per frame (``n_mfcc``).
        frames: Number of time steps expected per window.
        file_ext: Glob pattern selecting the audio files in each sub-directory.

    Returns:
        A tuple ``(features, labels)`` where ``features`` has shape
        ``(n_windows, frames, bands)`` and ``labels`` is an integer array of
        length ``n_windows``.

    Raises:
        ValueError: If a file name's second dash-separated field is not an
            integer, or if the MFCC output does not hold ``frames * bands``
            values per window (raised by the reshape).
    """
    # 512 matches librosa's default hop length, so a window of this many
    # samples is expected to produce exactly `frames` MFCC frames.
    window_size = 512 * (frames - 1)
    mfccs = []
    labels = []

    for sub_dir in sub_dirs:
        # Sorted so the example order is reproducible across runs and platforms.
        for filename in sorted(glob.glob(os.path.join(parent_dir, sub_dir, file_ext))):
            audio, Fs = librosa.load(filename)
            # The class id is the second dash-separated field of the file name;
            # basename() keeps this independent of path depth and separator.
            label = int(os.path.basename(filename).split('-')[1])

            for (start, end) in sliding_window(audio, window_size):
                segment = audio[start:end]
                # Trailing windows run past the end of the clip; skip them.
                if len(segment) == window_size:
                    # librosa returns (bands, time); transpose to (time, bands).
                    mfcc = librosa.feature.mfcc(y = segment, sr = Fs, n_mfcc = bands).T
                    mfccs.append(mfcc)
                    labels.append(label)

    features = np.asarray(mfccs).reshape(len(mfccs), frames, bands)

    return features, np.array(labels, dtype = int)

def one_hot_encode(labels, n_classes=None):
    """Convert integer class ids into a one-hot matrix.

    Args:
        labels: 1-D sequence of non-negative integer class ids.
        n_classes: Optional number of columns. When omitted, the width is
            ``max(labels) + 1`` so that every id present has a column even if
            some lower ids are absent from this particular split.

    Returns:
        Float array of shape ``(len(labels), n_classes)`` with a single 1 per
        row at the column given by the label.

    Raises:
        ValueError: If a label is negative, or does not fit in ``n_classes``.
    """
    label_ids = np.asarray(labels, dtype=int)
    n_labels = len(label_ids)

    if n_labels and label_ids.min() < 0:
        raise ValueError("labels must be non-negative class ids")

    if n_classes is None:
        # Counting distinct values would under-size the matrix whenever a
        # class is missing from `labels`; size by the largest id instead.
        n_classes = int(label_ids.max()) + 1 if n_labels else 0
    elif n_labels and label_ids.max() >= n_classes:
        raise ValueError(
            "label %d does not fit in n_classes=%d" % (label_ids.max(), n_classes)
        )

    encoded = np.zeros((n_labels, n_classes))
    encoded[np.arange(n_labels), label_ids] = 1
    return encoded

# In[15]:
# Where the audio lives, and which folds feed training versus testing.
parent_dir = 'Sound-Data'
tr_sub_dirs = ['fold1', 'fold3']
ts_sub_dirs = ['fold2', 'fold4']

# Extract MFCC windows and integer class ids for both splits first ...
tr_features, tr_labels = extract_features(parent_dir, tr_sub_dirs, bands = bands, frames = frames)
ts_features, ts_labels = extract_features(parent_dir, ts_sub_dirs, bands = bands, frames = frames)

# ... then replace the class ids with their one-hot encodings.
tr_labels, ts_labels = one_hot_encode(tr_labels), one_hot_encode(ts_labels)

# In[19]:
# Placeholders and sessions are graph-mode constructs. Under TensorFlow 2 the
# compat.v1 import alone leaves eager execution on, and tf.placeholder raises,
# so switch to graph mode before anything is built.
tf.disable_eager_execution()

# Start from an empty graph so re-running this cell does not pile up nodes.
tf.reset_default_graph()

batch_size = 50
display_step = 200

# Network Parameters
n_input = bands      # features per time step
n_steps = frames     # time steps per example
n_hidden = 320       # LSTM units
n_classes = 10       # width of the one-hot label vectors

learning_rate = 0.01
training_iterations = 4000

# Inputs: a batch of (n_steps, n_input) sequences and their one-hot labels.
x = tf.placeholder("float", [None, n_steps, n_input])
y = tf.placeholder("float", [None, n_classes])

# Output projection applied to the last LSTM output.
weight = tf.Variable(tf.random_normal([n_hidden, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

# In[20]:
def RNN(x, weight, bias):
    """Build the LSTM classifier and return its class probabilities.

    Runs ``x`` through a single LSTM layer of ``n_hidden`` units, takes the
    output at the final time step, applies the ``weight``/``bias`` projection
    and a softmax.

    Args:
        x: Input tensor fed to ``tf.nn.dynamic_rnn`` (batch-major).
        weight: Projection matrix multiplied with the last LSTM output.
        bias: Bias added after the projection.

    Returns:
        Tensor of softmax probabilities, one row per example.
    """
    lstm = rnn_cell.LSTMCell(n_hidden, state_is_tuple = True)
    stacked = rnn_cell.MultiRNNCell([lstm])
    outputs, _ = tf.nn.dynamic_rnn(stacked, x, dtype = tf.float32)

    # Move time to the leading axis so the final step can be gathered by index.
    time_major = tf.transpose(outputs, [1, 0, 2])
    final_step = int(time_major.get_shape()[0]) - 1
    last_output = tf.gather(time_major, final_step)

    logits = tf.matmul(last_output, weight) + bias
    return tf.nn.softmax(logits)

# In[21]:
prediction = RNN(x, weight, bias)

# Define loss and optimizer
# Summed cross-entropy over the batch. The probabilities are clipped away from
# zero first: a softmax output that underflows to 0 would otherwise give
# log(0) = -inf and turn the loss and gradients into NaN.
loss_f = -tf.reduce_sum(y * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss_f)

# Evaluate model: fraction of examples whose arg-max class matches the label.
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initializing the variables
init = tf.global_variables_initializer()

# In[*]:
with tf.Session() as session:
    # Variables must be initialised before the first training step.
    session.run(init)

    # Number of distinct batch start positions. Kept at >= 1 so the modulo
    # below stays defined when the training set is no larger than one batch.
    n_offsets = max(tr_labels.shape[0] - batch_size, 1)

    # NOTE: `epoch` counts single mini-batch updates, not passes over the data.
    for epoch in range(training_iterations):
        # Walk through the training set in consecutive slices, wrapping around.
        offset = (epoch * batch_size) % n_offsets
        batch_x = tr_features[offset:(offset + batch_size), :, :]
        batch_y = tr_labels[offset:(offset + batch_size), :]
        session.run(optimizer, feed_dict = {x: batch_x, y: batch_y})

        if epoch % display_step == 0:
            # Batch accuracy and loss after the update, fetched in one pass.
            acc, loss = session.run([accuracy, loss_f], feed_dict={x: batch_x, y: batch_y})
            print("Iter " + str(epoch) + ", Minibatch Loss= " + \
                  "{:.6f}".format(loss) + ", Training Accuracy= " + \
                  "{:.5f}".format(acc))

    # Final evaluation on the held-out folds, inside the session that owns the weights.
    test_acc = session.run(accuracy, feed_dict={x: ts_features, y: ts_labels})
    print('Test accuracy: ', round(float(test_acc), 3))

Iter 0, Minibatch Loss= 217.047424, Training Accuracy= 0.60000

Iter 200, Minibatch Loss= 68.556366, Training Accuracy= 0.50000
Iter 400, Minibatch Loss= 44.655693, Training Accuracy= 0.66000
Iter 600, Minibatch Loss= 61.274078, Training Accuracy= 0.50000
Iter 800, Minibatch Loss= 41.406906, Training Accuracy= 0.76000
Iter 1000, Minibatch Loss= 63.801964, Training Accuracy= 0.56000
Iter 1200, Minibatch Loss= 43.403629, Training Accuracy= 0.74000
Iter 1400, Minibatch Loss= 34.109589, Training Accuracy= 0.80000
Iter 1600, Minibatch Loss= 43.974281, Training Accuracy= 0.64000
Iter 1800, Minibatch Loss= 52.404976, Training Accuracy= 0.60000
Iter 2000, Minibatch Loss= 43.137421, Training Accuracy= 0.74000
Iter 2200, Minibatch Loss= 30.857113, Training Accuracy= 0.70000
Iter 2400, Minibatch Loss= 22.638533, Training Accuracy= 0.78000
Iter 2600, Minibatch Loss= 31.792271, Training Accuracy= 0.86000
Iter 2800, Minibatch Loss= 48.228432, Training Accuracy= 0.72000
Iter 3000, Minibatch Loss= 43.207615, Training Accuracy= 0.70000
Iter 3200, Minibatch Loss= 20.295015, Training Accuracy= 0.84000
Iter 3400, Minibatch Loss= 26.592484, Training Accuracy= 0.88000
Iter 3600, Minibatch Loss= 54.418182, Training Accuracy= 0.64000
Iter 3800, Minibatch Loss= 38.426224, Training Accuracy= 0.70000

In [ ]:

