如何使用 columnTruncateLength: 232 和 numFramesPerSpectrogram: 43 将 wav 文件转换为 tensorflowjs 的频谱图?

abh*_*ray 5 javascript node.js tensorflow tensorflow.js

我正在尝试在离线模式下使用 tensorflowjs 语音识别。使用麦克风的在线模式工作正常。但是对于离线模式,我找不到任何可靠的库,能够按照所需的数组规格(例如 fftSize: 1024、columnTruncateLength: 232、numFramesPerSpectrogram: 43)将 wav/mp3 文件转换为频谱图。

我尝试过的所有库(如 spectrogram.js)都没有这些转换选项。而 tensorflowjs 的语音识别文档清楚地提到了频谱图张量的以下规格

// Opens a tf.data.microphone source with the spectrogram parameters discussed
// in the question; both spectrogram and waveform output are requested.
const mic = await tf.data.microphone({
  fftSize: 1024,
  columnTruncateLength: 232,
  numFramesPerSpectrogram: 43,
  sampleRateHz:44100,
  includeSpectrogram: true,
  includeWaveform: true
});
Run Code Online (Sandbox Code Playgroud)

出现的错误为:Error: tensor4d() 要求在 values 为扁平数组时提供 shape

// Wait until the recognizer's model has finished loading.
await recognizer.ensureModelLoaded();
    // Read the whole HTTP response body into a buffer.
    var audiocaptcha = await response.buffer();
    // Save the raw bytes to disk. NOTE(review): the callback ignores write
    // errors, and the write is not awaited before execution continues.
    fs.writeFile("./afterverify.mp3", audiocaptcha, function (err) {
        if (err) {}
    });
    // NOTE(review): this builds a Float32Array straight from the response
    // bytes; no audio decoding or spectrogram computation happens in this
    // snippet — presumably each byte becomes one float, confirm.
    var bufferNewSamples =  new Float32Array(audiocaptcha);

    // Drop the tail so the length is a multiple of 9976 (= 43 * 232).
    const buffersliced = bufferNewSamples.slice(0,bufferNewSamples .length-(bufferNewSamples .length%9976));
    // NOTE(review): the tensor below is built from the untrimmed
    // bufferNewSamples; buffersliced is never used.
    const xtensor = tf.tensor(bufferNewSamples).reshape([-1, 
...recognizer.modelInputShape().slice(1)]);
Run Code Online (Sandbox Code Playgroud)

对数组进行切片并转换为张量之后,得到了下面这样的错误结果

output.scores
[ Float32Array [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 ],
  Float32Array [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 ],
  Float32Array [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 ],
  Float32Array [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 ],
  Float32Array [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 ] ]
score for word '_background_noise_' = 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
score for word '_unknown_' = 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
score for word 'down' = 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
score for word 'eight' = 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
score for word 'five' = 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
score for word 'four' = undefined
score for word 'go' = undefined
score for word 'left' = undefined
score for word 'nine' = undefined
score for word 'no' = undefined
score for word 'one' = undefined
score for word 'right' = undefined
score for word 'seven' = undefined
score for word 'six' = undefined
score for word 'stop' = undefined
score for word 'three' = undefined
score for word 'two' = undefined
score for word 'up' = undefined
score for word 'yes' = undefined
score for word 'zero' = undefined
Run Code Online (Sandbox Code Playgroud)

edk*_*ked 3

使用离线识别时的唯一要求是输入张量的形状为 [null, 43, 232, 1]

1 - 读取wav文件并获取数据数组

// Load the third-party spectrogram library.
var spectrogram = require('spectrogram');

// Create a spectrogram bound to the #canvas element, with its `audio.enable`
// option turned off.
// NOTE(review): the call uses capitalized `Spectrogram` while the require()
// result is bound to lowercase `spectrogram` — presumably the library also
// exposes a global; confirm.
var spectro = Spectrogram(document.getElementById('canvas'), {
  audio: {
    enable: false
  }
});

// Web Audio context; readWavFile uses it to decode the downloaded file.
var audioContext = new AudioContext();

/**
 * Downloads `audio.mp3` as an ArrayBuffer over XHR and decodes it with the
 * surrounding `audioContext`.
 *
 * @returns {Promise<AudioBuffer>} Resolves with the buffer handed to the
 *   decodeAudioData success callback; rejects if the request fails or the
 *   data cannot be decoded.
 */
function readWavFile() {
  return new Promise((resolve, reject) => {
    const request = new XMLHttpRequest();
    request.open('GET', 'audio.mp3', true);
    request.responseType = 'arraybuffer';

    request.onload = () => {
      // Callback form of decodeAudioData: success callback first, then the
      // error callback, so decode failures reject instead of hanging.
      audioContext.decodeAudioData(request.response, resolve, reject);
    };
    request.onerror = () => {
      reject(new Error('Failed to load audio.mp3'));
    };
    request.send();
  });
}

const buffer = await readWavFile()
Run Code Online (Sandbox Code Playgroud)

不使用第三方库也可以完成同样的事情。可以有 2 个选项。

  • 使用 <input type="file"> 读取文件。在这种情况下,这个答案展示了如何获取类型化数组。

  • 使用 http 请求提供并读取 wav 文件

// Load the third-party spectrogram library.
var spectrogram = require('spectrogram');

// Create a spectrogram bound to the #canvas element, with its `audio.enable`
// option turned off.
// NOTE(review): the call uses capitalized `Spectrogram` while the require()
// result is bound to lowercase `spectrogram` — presumably the library also
// exposes a global; confirm.
var spectro = Spectrogram(document.getElementById('canvas'), {
  audio: {
    enable: false
  }
});

// Web Audio context; readWavFile uses it to decode the downloaded file.
var audioContext = new AudioContext();

/**
 * Downloads `audio.mp3` as an ArrayBuffer over XHR and decodes it with the
 * surrounding `audioContext`.
 *
 * @returns {Promise<AudioBuffer>} Resolves with the buffer handed to the
 *   decodeAudioData success callback; rejects if the request fails or the
 *   data cannot be decoded.
 */
function readWavFile() {
  return new Promise((resolve, reject) => {
    const request = new XMLHttpRequest();
    request.open('GET', 'audio.mp3', true);
    request.responseType = 'arraybuffer';

    request.onload = () => {
      // Callback form of decodeAudioData: success callback first, then the
      // error callback, so decode failures reject instead of hanging.
      audioContext.decodeAudioData(request.response, resolve, reject);
    };
    request.onerror = () => {
      reject(new Error('Failed to load audio.mp3'));
    };
    request.send();
  });
}

const buffer = await readWavFile()
Run Code Online (Sandbox Code Playgroud)

2-将缓冲区转换为类型数组

// `buffer` is the decoded AudioBuffer returned by readWavFile, not raw bytes,
// and calling Float32Array without `new` throws. Read the samples of the
// first channel, which are already a Float32Array.
const data = buffer.getChannelData(0);
Run Code Online (Sandbox Code Playgroud)

3-使用语音识别模型的形状将数组转换为张量

// Reshape the flat sample array to the model's input shape: keep every
// dimension after the first and let -1 infer the batch size. The closing `]`
// of the shape array was missing in the original snippet.
const x = tf
  .tensor(data)
  .reshape([-1, ...recognizer.modelInputShape().slice(1)]);
Run Code Online (Sandbox Code Playgroud)

如果上面最后一条命令失败,则说明数据不具备模型所需的形状。此时需要对张量进行切片以获得合适的形状,或者在录音时就遵循 fftSize 等参数的要求。

  • 为什么说这个答案不完整?如果它没有解决您的问题,也许您可以考虑另开一个新帖子来提问 (2认同)