Mur*_*nat 31 python audio machine-learning theano keras
我有几千个音频文件,我想用Keras和Theano对它们进行分类.到目前为止,我生成了每个音频文件的28x28频谱图(更大可能更好,但我只是想让算法工作),并将图像读入矩阵.所以最后我将这个大图像矩阵输入网络进行图像分类.
在教程中,我发现了这个mnist分类代码:
import numpy as np
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense
from keras.utils import np_utils
batch_size = 128
nb_classes = 10
nb_epochs = 2
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(60000, 784)
X_test = X_test.reshape(10000, 784)
X_train = X_train.astype("float32")
X_test = X_test.astype("float32")
X_train /= 255
X_test /= 255
print(X_train.shape[0], "train samples")
print(X_test.shape[0], "test samples")
y_train = np_utils.to_categorical(y_train, nb_classes)
y_test = np_utils.to_categorical(y_test, nb_classes)
model = Sequential()
model.add(Dense(output_dim = 100, input_dim = 784, activation= "relu"))
model.add(Dense(output_dim = 200, activation = "relu"))
model.add(Dense(output_dim = 200, activation = "relu"))
model.add(Dense(output_dim = nb_classes, activation = "softmax"))
model.compile(optimizer = "adam", loss = "categorical_crossentropy")
model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epochs, show_accuracy = True, verbose = 2, validation_data = (X_test, y_test))
score = model.evaluate(X_test, y_test, show_accuracy = True, verbose = 0)
print("Test score: ", score[0])
print("Test accuracy: ", score[1])
Run Code Online (Sandbox Code Playgroud)
这段代码运行,我得到了预期的结果:
(60000L, 'train samples')
(10000L, 'test samples')
Train on 60000 samples, validate on 10000 samples
Epoch 1/2
2s - loss: 0.2988 - acc: 0.9131 - val_loss: 0.1314 - val_acc: 0.9607
Epoch 2/2
2s - loss: 0.1144 - acc: 0.9651 - val_loss: 0.0995 - val_acc: 0.9673
('Test score: ', 0.099454972004890438)
('Test accuracy: ', 0.96730000000000005)
Run Code Online (Sandbox Code Playgroud)
到目前为止,一切都运行良好,但是当我将上述算法应用于我的数据集时,准确性就会卡住.
我的代码如下:
import os
import pandas as pd
from sklearn.cross_validation import train_test_split
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.utils import np_utils
import AudioProcessing as ap
import ImageTools as it
batch_size = 128
nb_classes = 2
nb_epoch = 10
for i in range(20):
print "\n"
# Generate spectrograms if necessary
if(len(os.listdir("./AudioNormalPathalogicClassification/Image")) > 0):
print "Audio files are already processed. Skipping..."
else:
print "Generating spectrograms for the audio files..."
ap.audio_2_image("./AudioNormalPathalogicClassification/Audio/","./AudioNormalPathalogicClassification/Image/",".wav",".png",(28,28))
# Read the result csv
df = pd.read_csv('./AudioNormalPathalogicClassification/Result/result.csv', header = None)
df.columns = ["RegionName","IsNormal"]
bool_mapping = {True : 1, False : 0}
nb_classes = 2
for col in df:
if(col == "RegionName"):
a = 3
else:
df[col] = df[col].map(bool_mapping)
y = df.iloc[:,1:].values
y = np_utils.to_categorical(y, nb_classes)
# Load images into memory
print "Loading images into memory..."
X = it.load_images("./AudioNormalPathalogicClassification/Image/",".png")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
X_train = X_train.reshape(X_train.shape[0], 784)
X_test = X_test.reshape(X_test.shape[0], 784)
X_train = X_train.astype("float32")
X_test = X_test.astype("float32")
X_train /= 255
X_test /= 255
print("X_train shape: " + str(X_train.shape))
print(str(X_train.shape[0]) + " train samples")
print(str(X_test.shape[0]) + " test samples")
model = Sequential()
model.add(Dense(output_dim = 100, input_dim = 784, activation= "relu"))
model.add(Dense(output_dim = 200, activation = "relu"))
model.add(Dense(output_dim = 200, activation = "relu"))
model.add(Dense(output_dim = nb_classes, activation = "softmax"))
model.compile(loss = "categorical_crossentropy", optimizer = "adam")
print model.summary()
model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epoch, show_accuracy = True, verbose = 1, validation_data = (X_test, y_test))
score = model.evaluate(X_test, y_test, show_accuracy = True, verbose = 1)
print("Test score: ", score[0])
print("Test accuracy: ", score[1])
Run Code Online (Sandbox Code Playgroud)
AudioProcessing.py
import os
import scipy as sp
import scipy.io.wavfile as wav
import matplotlib.pylab as pylab
import Image
def save_spectrogram_scipy(source_filename, destination_filename, size):
dt = 0.0005
NFFT = 1024
Fs = int(1.0/dt)
fs, audio = wav.read(source_filename)
if(len(audio.shape) >= 2):
audio = sp.mean(audio, axis = 1)
fig = pylab.figure()
ax = pylab.Axes(fig, [0,0,1,1])
ax.set_axis_off()
fig.add_axes(ax)
pylab.specgram(audio, NFFT = NFFT, Fs = Fs, noverlap = 900, cmap="gray")
pylab.savefig(destination_filename)
img = Image.open(destination_filename).convert("L")
img = img.resize(size)
img.save(destination_filename)
pylab.clf()
del img
def audio_2_image(source_directory, destination_directory, audio_extension, image_extension, size):
nb_files = len(os.listdir(source_directory));
count = 0
for file in os.listdir(source_directory):
if file.endswith(audio_extension):
destinationName = file[:-4]
save_spectrogram_scipy(source_directory + file, destination_directory + destinationName + image_extension, size)
count += 1
print ("Generating spectrogram for files " + str(count) + " / " + str(nb_files) + ".")
Run Code Online (Sandbox Code Playgroud)
ImageTools.py
import os
import numpy as np
import matplotlib.image as mpimg
def load_images(source_directory, image_extension):
image_matrix = []
nb_files = len(os.listdir(source_directory));
count = 0
for file in os.listdir(source_directory):
if file.endswith(image_extension):
with open(source_directory + file,"r+b") as f:
img = mpimg.imread(f)
img = img.flatten()
image_matrix.append(img)
del img
count += 1
#print ("File " + str(count) + " / " + str(nb_files) + " loaded.")
return np.asarray(image_matrix)
Run Code Online (Sandbox Code Playgroud)
所以我运行上面的代码并接受:
Audio files are already processed. Skipping...
Loading images into memory...
X_train shape: (2394L, 784L)
2394 train samples
1027 test samples
--------------------------------------------------------------------------------
Initial input shape: (None, 784)
--------------------------------------------------------------------------------
Layer (name) Output Shape Param #
--------------------------------------------------------------------------------
Dense (dense) (None, 100) 78500
Dense (dense) (None, 200) 20200
Dense (dense) (None, 200) 40200
Dense (dense) (None, 2) 402
--------------------------------------------------------------------------------
Total params: 139302
--------------------------------------------------------------------------------
None
Train on 2394 samples, validate on 1027 samples
Epoch 1/10
2394/2394 [==============================] - 0s - loss: 0.6898 - acc: 0.5455 - val_loss: 0.6835 - val_acc: 0.5716
Epoch 2/10
2394/2394 [==============================] - 0s - loss: 0.6879 - acc: 0.5522 - val_loss: 0.6901 - val_acc: 0.5716
Epoch 3/10
2394/2394 [==============================] - 0s - loss: 0.6880 - acc: 0.5522 - val_loss: 0.6842 - val_acc: 0.5716
Epoch 4/10
2394/2394 [==============================] - 0s - loss: 0.6883 - acc: 0.5522 - val_loss: 0.6829 - val_acc: 0.5716
Epoch 5/10
2394/2394 [==============================] - 0s - loss: 0.6885 - acc: 0.5522 - val_loss: 0.6836 - val_acc: 0.5716
Epoch 6/10
2394/2394 [==============================] - 0s - loss: 0.6887 - acc: 0.5522 - val_loss: 0.6832 - val_acc: 0.5716
Epoch 7/10
2394/2394 [==============================] - 0s - loss: 0.6882 - acc: 0.5522 - val_loss: 0.6859 - val_acc: 0.5716
Epoch 8/10
2394/2394 [==============================] - 0s - loss: 0.6882 - acc: 0.5522 - val_loss: 0.6849 - val_acc: 0.5716
Epoch 9/10
2394/2394 [==============================] - 0s - loss: 0.6885 - acc: 0.5522 - val_loss: 0.6836 - val_acc: 0.5716
Epoch 10/10
2394/2394 [==============================] - 0s - loss: 0.6877 - acc: 0.5522 - val_loss: 0.6849 - val_acc: 0.5716
1027/1027 [==============================] - 0s
('Test score: ', 0.68490593621422047)
('Test accuracy: ', 0.57156767283349563)
Run Code Online (Sandbox Code Playgroud)
我尝试改变网络,添加更多的纪元,但无论如何我都会得到相同的结果.我不明白为什么我得到相同的结果.
任何帮助,将不胜感激.谢谢.
编辑:我发现了一个错误,其中像素值未正确读取.我将下面的ImageTools.py修复为:
import os
import numpy as np
from scipy.misc import imread
def load_images(source_directory, image_extension):
image_matrix = []
nb_files = len(os.listdir(source_directory));
count = 0
for file in os.listdir(source_directory):
if file.endswith(image_extension):
with open(source_directory + file,"r+b") as f:
img = imread(f)
img = img.flatten()
image_matrix.append(img)
del img
count += 1
#print ("File " + str(count) + " / " + str(nb_files) + " loaded.")
return np.asarray(image_matrix)
Run Code Online (Sandbox Code Playgroud)
现在我实际上得到了从0到255的灰度像素值,所以现在将它除以255是有道理的.但是,我仍然得到相同的结果.
The*_*ube 34
最可能的原因是优化程序不适合您的数据集.以下是文档中的Keras优化器列表.
我建议您先使用默认参数值尝试SGD.如果它仍然不起作用,则将学习率除以10.如有必要,请进行几次.如果你的学习率达到1e-6并且它仍然不起作用,那么你还有另外一个问题.
总之,替换此行:
model.compile(loss = "categorical_crossentropy", optimizer = "adam")
Run Code Online (Sandbox Code Playgroud)
有了这个:
from keras.optimizers import SGD
opt = SGD(lr=0.01)
model.compile(loss = "categorical_crossentropy", optimizer = opt)
Run Code Online (Sandbox Code Playgroud)
如果不起作用,可以多次改变学习率.
如果是问题,你应该看到在几个时代之后损失越来越少.
cha*_*l-f 14
另一个我在这里没有看到但对我造成类似问题的解决方案是最后一个神经元的激活函数,特别是如果它relu不是像sigmoid.
换句话说,它可能会帮助您在最后一层使用非线性激活函数
最后一层:
model.add(keras.layers.Dense(1, activation='relu'))
Run Code Online (Sandbox Code Playgroud)
输出:
7996/7996 [==============================] - 1s 76us/sample - loss: 6.3474 - accuracy: 0.5860
Epoch 2/30
7996/7996 [==============================] - 0s 58us/sample - loss: 6.3473 - accuracy: 0.5860
Epoch 3/30
7996/7996 [==============================] - 0s 58us/sample - loss: 6.3473 - accuracy: 0.5860
Epoch 4/30
7996/7996 [==============================] - 0s 57us/sample - loss: 6.3473 - accuracy: 0.5860
Epoch 5/30
7996/7996 [==============================] - 0s 58us/sample - loss: 6.3473 - accuracy: 0.5860
Epoch 6/30
7996/7996 [==============================] - 0s 60us/sample - loss: 6.3473 - accuracy: 0.5860
Epoch 7/30
7996/7996 [==============================] - 0s 57us/sample - loss: 6.3473 - accuracy: 0.5860
Epoch 8/30
7996/7996 [==============================] - 0s 57us/sample - loss: 6.3473 - accuracy: 0.5860
Run Code Online (Sandbox Code Playgroud)
现在我使用了一个非线性激活函数:
model.add(keras.layers.Dense(1, activation='sigmoid'))
Run Code Online (Sandbox Code Playgroud)
输出:
7996/7996 [==============================] - 1s 74us/sample - loss: 0.7663 - accuracy: 0.5899
Epoch 2/30
7996/7996 [==============================] - 0s 59us/sample - loss: 0.6243 - accuracy: 0.5860
Epoch 3/30
7996/7996 [==============================] - 0s 56us/sample - loss: 0.5399 - accuracy: 0.7580
Epoch 4/30
7996/7996 [==============================] - 0s 56us/sample - loss: 0.4694 - accuracy: 0.7905
Epoch 5/30
7996/7996 [==============================] - 0s 57us/sample - loss: 0.4363 - accuracy: 0.8040
Epoch 6/30
7996/7996 [==============================] - 0s 60us/sample - loss: 0.4139 - accuracy: 0.8099
Epoch 7/30
7996/7996 [==============================] - 0s 58us/sample - loss: 0.3967 - accuracy: 0.8228
Epoch 8/30
7996/7996 [==============================] - 0s 61us/sample - loss: 0.3826 - accuracy: 0.8260
Run Code Online (Sandbox Code Playgroud)
这不是原始答案的直接解决方案,但由于搜索此问题时答案在 Google 上排名第一,因此可能会使某人受益。
如果准确率没有变化,则意味着优化器已经找到了损失的局部最小值。这可能是不希望的最小值。一种常见的局部最小值是始终预测具有最多数据点的类别。您应该对类使用权重来避免这个最小值。
from sklearn.utils import compute_class_weight
classWeight = compute_class_weight('balanced', outputLabels, outputs)
classWeight = dict(enumerate(classWeight))
model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epochs, show_accuracy = True, verbose = 2, validation_data = (X_test, y_test), class_weight=classWeight)
Run Code Online (Sandbox Code Playgroud)
看看这个
sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile( loss = "categorical_crossentropy",
optimizer = sgd,
metrics=['accuracy']
)
Run Code Online (Sandbox Code Playgroud)
查看 文档
我在MNIST上取得了更好的结果
经过检查,我发现问题出在数据本身。由于在同一输入中有2个不同的输出,因此非常脏,因此造成混乱。清除数据后,现在我的精度提高到%69。仍然还不够好,但由于数据清楚了,至少我现在可以从这里开始。
我使用以下代码进行测试:
import os
import sys
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.utils import np_utils
sys.path.append("./")
import AudioProcessing as ap
import ImageTools as it
# input image dimensions
img_rows, img_cols = 28, 28
dim = 1
# number of convolutional filters to use
nb_filters = 32
# size of pooling area for max pooling
nb_pool = 2
# convolution kernel size
nb_conv = 3
batch_size = 128
nb_classes = 2
nb_epoch = 200
for i in range(20):
print "\n"
## Generate spectrograms if necessary
if(len(os.listdir("./AudioNormalPathalogicClassification/Image")) > 0):
print "Audio files are already processed. Skipping..."
else:
# Read the result csv
df = pd.read_csv('./AudioNormalPathalogicClassification/Result/AudioNormalPathalogicClassification_result.csv', header = None, encoding = "utf-8")
df.columns = ["RegionName","Filepath","IsNormal"]
bool_mapping = {True : 1, False : 0}
for col in df:
if(col == "RegionName" or col == "Filepath"):
a = 3
else:
df[col] = df[col].map(bool_mapping)
region_names = df.iloc[:,0].values
filepaths = df.iloc[:,1].values
y = df.iloc[:,2].values
#Generate spectrograms and make a new CSV file
print "Generating spectrograms for the audio files..."
result = ap.audio_2_image(filepaths, region_names, y, "./AudioNormalPathalogicClassification/Image/", ".png",(img_rows,img_cols))
df = pd.DataFrame(data = result)
df.to_csv("NormalVsPathalogic.csv",header= False, index = False, encoding = "utf-8")
# Load images into memory
print "Loading images into memory..."
df = pd.read_csv('NormalVsPathalogic.csv', header = None, encoding = "utf-8")
y = df.iloc[:,0].values
y = np_utils.to_categorical(y, nb_classes)
y = np.asarray(y)
X = df.iloc[:,1:].values
X = np.asarray(X)
X = X.reshape(X.shape[0], dim, img_rows, img_cols)
X = X.astype("float32")
X /= 255
print X.shape
model = Sequential()
model.add(Convolution2D(64, nb_conv, nb_conv,
border_mode='valid',
input_shape=(1, img_rows, img_cols)))
model.add(Activation('relu'))
model.add(Convolution2D(32, nb_conv, nb_conv))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
print model.summary()
model.fit(X, y, batch_size = batch_size, nb_epoch = nb_epoch, show_accuracy = True, verbose = 1)
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
49576 次 |
| 最近记录: |