Lui*_*uis 5 python speech-to-text google-cloud-platform diarization
我正在谷歌云上运行一个虚拟机实例。我的目标是对存储在云存储桶中的多个 .wav 文件进行说话人分离(speaker diarization)。
我尝试了以下几种方案,但每种方案都遇到了各自的问题:
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import os
import json
import sys
# Transcribe every not-yet-processed .wav file of one channel with Google
# Speech-to-Text (speaker diarization enabled) and store the result as a JSON
# object under "<channel>/" in the "transcript_output" bucket.
#
# Usage: python script.py --channel <name>

# Clients for Cloud Storage (listing / uploading) and Speech-to-Text.
storage_client = storage.Client()
client = speech.SpeechClient()

# "--channel <name>" is mandatory: every bucket name below is derived from
# it, so stop immediately instead of failing later with a NameError.
if "--channel" not in sys.argv:
    sys.exit("--channel option requires a value")
index = sys.argv.index("--channel") + 1
if index >= len(sys.argv):
    sys.exit("--channel option requires a value")
channel = sys.argv[index]
print("Channel:", channel)

audio_folder = f'audio_{channel}'
transcript_folder = 'transcript_output'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)

wav_files = [i.name for i in bucket.list_blobs()]
# Names of transcripts that already exist for this channel, with the
# "<channel>/" prefix stripped; a set keeps the per-file lookup cheap.
json_files = {
    i.name.split(f'{channel}/')[-1]
    for i in bucket2.list_blobs(prefix=channel)
}

# The recognition settings are identical for every file, so build them once.
diarization_config = speech.SpeakerDiarizationConfig(
    enable_speaker_diarization=True,
    min_speaker_count=2,
    # max_speaker_count=10,
)
config = speech.RecognitionConfig(
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
    # sample_rate_hertz=8000,
    language_code="es-ES",
    diarization_config=diarization_config,
    # audio_channel_count=2,
)

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    # Swap only the trailing extension; str.replace would also rewrite a
    # ".wav" that happens to appear earlier in the object name.
    transcript_name = file[:-len('.wav')] + '.json'
    if transcript_name in json_files:
        continue

    gcs_uri = f"gs://{audio_folder}/{file}"
    audio = speech.RecognitionAudio(uri=gcs_uri)

    print("Waiting for operation to complete...")
    operation = client.long_running_recognize(config=config, audio=audio)
    response = operation.result()

    if not response.results:
        # Nothing was recognized; indexing [-1] would raise IndexError.
        print(f'File {file} produced no results, skipping. ')
        continue

    # Only the last result is stored.
    # NOTE(review): with diarization enabled the last result is expected to
    # hold the full speaker-tagged word list - confirm against the API docs.
    result = response.results[-1]

    # Upload through the storage client instead of shelling out to gsutil:
    # no unquoted names in a shell string, failures raise instead of being
    # ignored, and the object always lands at "<channel>/<name>", which is
    # where the "already processed" check above looks for it.
    bucket2.blob(f'{channel}/{transcript_name}').upload_from_string(
        json.dumps(str(result)),
        content_type='application/json',
    )
    print(f'File {file} processed. ')
Run Code Online (Sandbox Code Playgroud)
无论如何更改 max_speaker_count 或 min_speaker_count,结果都是相同的。
由于上述方法不起作用,我决定尝试使用 pyannote。它的效果非常好,但有一个问题:它非常慢。对于 30 分钟的 wav 文件,需要 3 个多小时才能完成说话人分离(diarization)。
这是我的代码:
#import packages
import os
from datetime import datetime
import pandas as pd
from pyannote.audio import Pipeline
from pyannote.audio import Model
from pyannote.core.json import dump
from pyannote.core.json import load
from pyannote.core.json import loads
from pyannote.core.json import load_from
import subprocess
from pyannote.database.util import load_rttm
from google.cloud import speech_v1p1beta1 as speech
from google.cloud import storage
import sys
# channel='a3'
# Run the pyannote speaker-diarization pipeline on every not-yet-processed
# 2023 .wav file of one channel and upload the resulting RTTM file to the
# "transcript_<channel>" bucket.
#
# Usage: python script.py --channel <name>

storage_client = storage.Client()

# "--channel <name>" is mandatory: both bucket names are derived from it, so
# stop immediately instead of failing later with a NameError.
if "--channel" not in sys.argv:
    sys.exit("--channel option requires a value")
index = sys.argv.index("--channel") + 1
if index >= len(sys.argv):
    sys.exit("--channel option requires a value")
channel = sys.argv[index]
print("Channel:", channel)

audio_folder = f'audio_{channel}'
transcript_folder = f'transcript_{channel}'
bucket = storage_client.bucket(audio_folder)
bucket2 = storage_client.bucket(transcript_folder)

wav_files = [i.name for i in bucket.list_blobs()]
# RTTM files that already exist; a set keeps the per-file lookup cheap.
rttm_files = {i.name for i in bucket2.list_blobs()}

token = "XXX"
# Only the diarization pipeline is loaded; it is the only model this script
# actually calls.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=token)
# NOTE(review): nothing here places the pipeline on the GPU explicitly.
# Depending on the installed pyannote.audio version this may have to be done
# by hand (e.g. pipeline.to(torch.device("cuda"))) - confirm, since a
# CPU-only run would explain the very long processing times.

for file in wav_files:
    if not file.endswith('.wav'):
        continue
    # Swap only the trailing extension; str.replace would also rewrite a
    # ".wav" that happens to appear earlier in the object name.
    rttm_name = file[:-len('.wav')] + '.rttm'
    if rttm_name in rttm_files:
        continue
    if '2023' not in file:
        continue

    print(f'Doing file {file}')

    # Work on flat local copies so object names containing "/" do not need
    # matching local directories.
    local_wav = os.path.basename(file)
    local_rttm = os.path.basename(rttm_name)
    try:
        # The storage client replaces the shelled-out gsutil calls: no
        # unquoted names in a shell string, and a failed transfer raises
        # instead of being silently ignored.
        bucket.blob(file).download_to_filename(local_wav)
        diarization = pipeline(local_wav)
        with open(local_rttm, "w") as rttm:
            diarization.write_rttm(rttm)
        bucket2.blob(rttm_name).upload_from_filename(local_rttm)
    finally:
        # Always remove the local copies, even when a step above failed.
        for path in (local_wav, local_rttm):
            if os.path.exists(path):
                os.remove(path)
Run Code Online (Sandbox Code Playgroud)
我在具有 GPU NVIDIA-T4 的 VM 实例上使用 python3.9 运行此程序。
这是正常的吗?我了解到 pyannote.audio 本身就比较慢,处理速度大约只有 1 倍实时速度,但这次的耗时远远超出了这个水平,而理论上它应该运行在专用的 GPU 上……
有没有更快的替代方案?有什么方法可以改进代码或设计可以提高速度的虚拟机?
小智 7
为了让这个在GPU上快速工作(以Google colab为例):你需要先安装pyannote:
# Install the development branch of pyannote.audio directly from its GitHub
# archive (-qq keeps pip's output quiet).
!pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
Run Code Online (Sandbox Code Playgroud)
然后:
# NOTE(review): this command is identical to the install command shown just
# before it; the follow-up snippet that was meant to appear here seems to be
# missing - confirm against the original answer.
!pip install -qq https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
2249 次 |
| 最近记录: |