我可以信任哪种工具?

I a*_*Fat 6 python plot matplotlib librosa kaldi

我似乎不得不确定哪些工具可以信任......

我一直在测试的工具是Librosa和Kaldi,目的是创建一个数据集,其中包含音频文件的40个滤波器组能量的可视化图.

使用kaldi中的这些配置提取滤波器组能量.

fbank.conf

--htk-compat=false
--window-type=hamming
--sample-frequency=16000
--num-mel-bins=40
--use-log-fbank=true
Run Code Online (Sandbox Code Playgroud)

使用librosa绘图绘制提取的数据.Librosa利用matplotlib pcolormesh,这意味着不应该有任何区别,除了librosa提供更容易使用的API.

# Diagnostics for the feature matrix: shape, type and value range.
print static.shape
print type(static)
print np.min(static)
print np.max(static)
fig = plt.figure()
# Draw the transposed matrix with the jet colormap; the x axis is labelled in
# frames and the y axis on a mel scale.
librosa.display.specshow(static.T,sr=16000,x_axis='frames',y_axis='mel',hop_length=160,cmap=cm.jet)
#plt.axis('off')
plt.title("log mel power spectrum of " + name)
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
# NOTE(review): `plot` is presumably an output directory defined elsewhere - confirm.
plt.savefig(plot+"/"+name+"_plot_static_conv.png")
plt.show()
Run Code Online (Sandbox Code Playgroud)

输出:

(474, 40)
<type 'numpy.ndarray'>
-1.828067
22.70058
Got bus address:  "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf" 
Connected to accessibility bus at:  "unix:abstract=/tmp/dbus-aYbBS1JWyw,guid=17dd413abcda54272e1d93d159174cdf" 
Registered DEC:  true 
Registered event listener change listener:  true 
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

用Librosa创建的类似的图如下:

# Compute a 40-band log mel spectrogram with librosa and plot it.
audio_path="../../../../Dropbox/SI1392.wav"
#audio_path = librosa.util.example_audio_file()
print "Example audio found"
# NOTE(review): no `sr` is passed here, so librosa.load uses its own default
# sampling rate (it resamples unless told otherwise) - confirm this matches the
# 16 kHz assumed by n_fft=400 / hop_length=160 below.
y, sr = librosa.load(audio_path)
print "Example audio loaded"
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40)
print "Example audio spectogram"
log_specto = librosa.core.logamplitude(specto)

# Value range of the log-scaled spectrogram.
print "min and max"
print np.min(log_specto)
print np.max(log_specto)
print "Example audio log specto"

plt.figure(figsize=(12,4))
# Same display call as for the Kaldi features, but using the loaded `sr`.
librosa.display.specshow(log_specto,sr=sr,x_axis='frames', y_axis='mel', hop_length=160,cmap=cm.jet)

plt.title('mel power spectrogram')

plt.colorbar(format='%+02.0f dB')

plt.tight_layout()
print "See"

# Both arrays are printed to show that the log scaling keeps the shape.
print specto.shape

print log_specto.shape
plt.show()
Run Code Online (Sandbox Code Playgroud)

输出:

libraries loaded!
Example audio found
Example audio loaded
Example audio spectogram
min and max
-84.6796661558
-4.67966615584
Example audio log specto
See
(40, 657)
(40, 657)
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

尽管有颜色,两者都显示相似的图,但能量范围似乎有点不同.

Kaldi的最小值/最大值为-1.828067/22.70058

Librosa的最小值/最大值为-84.6796661558/-4.67966615584

问题是我试图将这些图存储为numpy数组,以便进一步处理.

这似乎创建了不同的图.使用Librosa数据,我创建的图如下:

# Build an RGBA array from the spectrogram and show a plot.
plt.figure()
# NOTE(review): MinMaxScaler rescales each column independently, not the whole
# array - confirm against the sklearn documentation.
min_max_scaled_log_specto = min_max_scaler.fit_transform(log_specto)
convert = plt.get_cmap(cm.jet)
# numpy_static holds the colormapped array, but it is NOT what is displayed below.
numpy_static = convert(min_max_scaled_log_specto)
# The raw (unscaled) log_specto is shown here, with no cmap argument, so
# imshow does its own color mapping.
plt.imshow(np.flipud(log_specto), aspect='auto')
plt.colorbar()
print "Sooo?"
plt.show()
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

这很完美...它与原始数据集相似..

但是对于Kaldi,我用这段代码得到了这样的图:

# Apply the jet colormap directly to the flipped, transposed Kaldi features.
convert = plt.get_cmap(cm.jet)
# NOTE(review): the values are passed to the colormap without any
# normalization; a colormap called on floats presumably expects values in
# [0, 1] and clips everything else to the end colors - confirm.
numpy_output_static = convert(np.flipud(static.T))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()
# Blocks until the user presses Enter.
raw_input("sadas")
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

我从之前的帖子中了解到,出现红色的原因可能与数值范围有关,事先进行归一化应该会有所帮助 - 但结果却变成了这样:

# Same plot as before, but the values are scaled to [0, 1] with MinMaxScaler first.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))
convert = plt.get_cmap(cm.jet)
# NOTE(review): fit_transform scales each column of its input separately, so
# every column of the transposed matrix gets its own min/max - confirm against
# the sklearn documentation.
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T)))
plt.imshow(numpy_output_static,aspect = 'auto')
plt.show()
Run Code Online (Sandbox Code Playgroud)

在此输入图像描述

但这与最初的Kaldi图完全不像......那么为什么它看起来是这样的呢?为什么我能用从Librosa提取的能量绘制出正确的图,而用从Kaldi提取的能量却不行?

Librosa的最小工作示例:

#
#   Minimal example of Librosa plot example.
#   Made for testing the plot, and to test for accurate
#   conversion between the two parts.
#

import os
import sys
from os import listdir
from os.path import isfile, join
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import Normalize
import matplotlib
from PIL import Image
import librosa
import colormaps as cmaps
import librosa.display
import ast
from scipy.misc import toimage
from matplotlib import cm
from sklearn import preprocessing

# Minimal Librosa example: compute a 40-band log mel spectrogram, plot it with
# specshow, and build the equivalent RGBA numpy array for later processing.
print("libraries loaded!")
# Not used for the color mapping below; the name is kept so that other code
# relying on it still finds it.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))

audio_path = "../../../../Dropbox/SI1392.wav"
#audio_path = librosa.util.example_audio_file()
print("Example audio found")
# Load at 16 kHz explicitly. Without `sr`, librosa resamples to its own
# default rate, so n_fft=400 / hop_length=160 would no longer correspond to
# the 25 ms / 10 ms framing they represent at 16 kHz, and the frame count
# would differ from features extracted at 16 kHz.
y, sr = librosa.load(audio_path, sr=16000)
print("Example audio loaded")
specto = librosa.feature.melspectrogram(y, sr=sr, n_fft=400, hop_length=160, n_mels=40)
print("Example audio spectogram")
log_specto = librosa.core.logamplitude(specto)

print("min and max")
print(np.min(log_specto))
print(np.max(log_specto))
print("Example audio log specto")

# Reference plot drawn by librosa.
plt.figure(figsize=(12, 4))
librosa.display.specshow(log_specto, sr=sr, x_axis='frames', y_axis='mel', hop_length=160, cmap=cm.jet)
plt.title('mel power spectrogram')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
print("See")

print(specto.shape)
print(log_specto.shape)

plt.figure()
# Normalize with ONE global min/max. MinMaxScaler rescales every column on
# its own, which changes the relative energies between frames and distorts
# the picture once the colormap is applied.
norm_log_specto = matplotlib.colors.Normalize(vmin=log_specto.min(), vmax=log_specto.max())
convert = plt.get_cmap(cm.jet)
# RGBA array intended for storage / further processing.
numpy_static = convert(norm_log_specto(log_specto))
# The raw spectrogram is displayed (imshow applies its own color mapping) so
# that the colorbar keeps showing the actual log-energy values.
plt.imshow(np.flipud(log_specto), aspect='auto')
plt.colorbar()
print("Sooo?")
plt.show()
Run Code Online (Sandbox Code Playgroud)

使用kaldi的最小工作示例 - (实际数据):

#
#   Extracted version:
#
#
#

import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from PIL import Image
import librosa
import librosa.display
from matplotlib import cm
from sklearn import preprocessing
import ast
import urllib
import os
import sys
from os import listdir
from os.path import isfile, join

min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0,1))

def make_plot_store_data(name,interweaved,static,delta,delta_delta,isTrain,isTest,isDev):
    """Plot the static filterbank features and, for training data, show them as an RGBA image.

    Prints the shape, type, minimum and maximum of ``static``, draws it with
    ``librosa.display.specshow`` and, when ``isTrain`` is set, converts it to a
    jet-colormapped array and displays that with ``imshow``.

    Args:
        name: Label appended to the plot title.
        interweaved: Unused by this function.
        static: 2-D numpy array of features; it is transposed before plotting
            (the caller in this file passes one row per frame and 40 columns).
        delta: Unused by this function.
        delta_delta: Unused by this function.
        isTrain: When true, the colormapped image is built and shown, and the
            function then waits for the user to press Enter.
        isTest: Unused by this function.
        isDev: Unused by this function.

    Returns:
        None.
    """
    print(static.shape)
    print(type(static))
    print(np.min(static))
    print(np.max(static))

    # Reference plot drawn by librosa.
    plt.figure()
    librosa.display.specshow(static.T, sr=16000, x_axis='frames', y_axis='mel', hop_length=160, cmap=cm.jet)
    plt.title("log mel power spectrum of " + name)
    plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()

    if isTrain:
        plt.figure()
        convert = plt.get_cmap(cm.jet)
        # Map the whole matrix to [0, 1] with a single global min/max.
        # MinMaxScaler.fit_transform rescales each column separately, which
        # stretches every frame to the full color range and destroys the
        # spectrogram structure.
        norm_static = matplotlib.colors.Normalize(vmin=static.min(), vmax=static.max())
        numpy_output_static = convert(norm_static(np.flipud(static.T)))
        plt.imshow(numpy_output_static, aspect='auto')
        plt.show()
        # Keep the process alive until the user confirms.
        raw_input("sadas")

# Download a Kaldi text-format feature matrix, parse it into a numpy array and
# split it into static / delta / delta-delta parts before plotting.
link = "https://gist.githubusercontent.com/Miail/51311b34f5e5333bbddf9cb17c737ea4/raw/786b72477190023e93b9dd0cbbb43284ab59921b/feature.txt"
f = urllib.urlopen(link)

temp_list = []
try:
    for line in f:
        data_splitted = line.split()
        if len(data_splitted) == 2:
            # A two-token line is treated as the matrix header; its first
            # token is kept as the file name.
            file_name = data_splitted[0]
            continue
        # Drop the closing bracket that ends the matrix. The emptiness check
        # keeps blank lines from raising IndexError.
        if data_splitted and data_splitted[-1] == ']':
            data_splitted = data_splitted[:-1]
        temp_list.extend(ast.literal_eval(i) for i in data_splitted)
finally:
    f.close()

# Each frame holds 120 values: 40 static, 40 delta and 40 delta-delta.
dimension = 120
# Floor division keeps the reshape arguments integral regardless of whether
# true division is in effect.
entries = len(temp_list) // dimension
data = np.array(temp_list)
interweaved = data.reshape(entries, dimension)
static = interweaved[:, :-80]
delta = interweaved[:, 40:-40]
delta_delta = interweaved[:, 80:]
plot_interweaved = data.reshape(entries * 3, dimension // 3)
print(static.shape)
print(delta.shape)
print(delta_delta.shape)
make_plot_store_data(file_name, plot_interweaved, static, delta, delta_delta, True, False, False)
Run Code Online (Sandbox Code Playgroud)

I a*_*Fat 3

我似乎在这篇文章中找到了答案。问题出在我的归一化上。所以不应该这样做:

# Problematic version: the values are scaled with MinMaxScaler before the colormap is applied.
numpy_output_static = convert(min_max_scaler.fit_transform(np.flipud(static.T)))
Run Code Online (Sandbox Code Playgroud)

我应该这样做:

# Normalize using the minimum and maximum of the whole matrix, then apply the colormap.
norm_static = matplotlib.colors.Normalize(vmin=static.min(),vmax=static.max())
numpy_output_static = convert(norm_static(np.flipud(static.T)))
Run Code Online (Sandbox Code Playgroud)