想要用Python+whisper.cpp实现纯本地化语音转文字,我的操作环境如下:
MacOS Ventura 13.0
Python3.7
conda
PyAudio
一开始打算用PyAudio,解决了头文件找不到的问题(网上教程很多)之后,仍然遇到:
Could not import the PyAudio C module 'pyaudio._portaudio'.
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/.../site-packages/pyaudio/__init__.py", line 111, in <module>
import pyaudio._portaudio as pa
ImportError: dlopen(/.../site-packages/pyaudio/_portaudio.cpython-37-darwin.so, 0x0002): symbol not found in flat namespace '_PaMacCore_SetupChannelMap'
可以通过 nm 命令检查编译出的 so 文件中该符号的定义情况:
nm -g /.../site-packages/pyaudio/_portaudio.cpython-37-darwin.so | grep " _PaMacCore_SetupChannelMap"
显示为:
U _PaMacCore_SetupChannelMap
这个 U 表示该符号未定义(undefined)。
另外检查发现 PaMacCore 相关的符号都是未定义的。
这说明在编译链接这个 Python 扩展模块时,没有正确包含 PaMacCore 相关的符号。反复尝试重装portaudio(homebrew包)和pyaudio,还是解决不了,目测是portaudio/pyaudio包对ventura版本的支持还是没做好,需等待进一步新版本,于是放弃pyaudio转而使用sounddevice。
sounddevice
一开始,我直接让AI基于sounddevice程序包给我写了一段代码,但怎么都录不上声音,调整过输入设备指定等代码还是不行。于是从官方文档找了一段示例代码:https://python-sounddevice.readthedocs.io/en/0.4.6/examples.html#real-time-text-mode-spectrogram
让AI基于该代码去一次次修改,然后再回过头来删掉没什么用的代码,留下一个带注释的极简版本:
import sounddevice as sd
import numpy as np
import wave
import time
# Recording settings: clip length, sample rate, channel count and input device
duration = 5 # seconds
sample_rate = 44100 # samples per second (Hz)
channels = 1 # mono
device_info = 'MacBook Pro Microphone' # passed as device= to sd.InputStream below
# List that accumulates the audio chunks handed to the callback
buffer = []
# Callback that receives the input data from the stream
def callback(indata, frames, time, status):
    """Append one chunk of captured audio to the module-level ``buffer``.

    Passed as ``callback=`` to ``sd.InputStream``.

    Args:
        indata: Samples captured for this chunk. A copy is stored so the
            saved chunk does not alias the array owned by the stream.
        frames: Number of frames in ``indata`` (unused).
        time: Timing information supplied by the stream (unused). Inside
            this function the name shadows the ``time`` module.
        status: Flags reported by the stream; printed when set so that
            problems such as dropped input are not silently ignored.
    """
    if status:
        print(status)
    # ``buffer`` is only mutated, never rebound, so no ``global`` is needed.
    buffer.append(indata.copy())
# Open an input stream and record for `duration` seconds
print(device_info)
print(sample_rate)
with sd.InputStream(callback=callback, device=device_info,
                    samplerate=sample_rate, channels=channels):
    print('Recording...')
    sd.sleep(duration * 1000)  # sd.sleep() takes milliseconds
print('Recording finished')

# Save the buffered audio to a WAV file
if not buffer:
    # np.concatenate would fail with a less helpful message on an empty list.
    raise ValueError('No audio was captured from the input device')
buffer = np.concatenate(buffer, axis=0)
filename = 'recorded_audio.wav'
with wave.open(filename, 'w') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(2)  # 2 bytes for 16-bit samples
    wf.setframerate(sample_rate)
    # The stream delivers float samples (sounddevice's default dtype is
    # float32, nominally in [-1.0, 1.0]). Clip first so that out-of-range
    # samples saturate instead of wrapping around in the int16 cast.
    wav_data = (np.clip(buffer, -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16)
    wf.writeframes(wav_data.tobytes())
print(f"File saved as {filename}")
终于把声音给录上了。回过头来查看,之前录不上是因为buffer的定义有问题,一开始AI给我生成的代码是:
# Create a buffer for the recorded audio
# NOTE(review): np.zeros returns an ndarray, which has no append() method, so this
# definition cannot work with a callback that calls buffer.append(...).
buffer = np.zeros((duration * sample_rate,))
其实只需要将buffer声明为空列表(buffer = [])就可以了。
AI写代码坑还是很多啊。使用某个Python程序包的时候,最好先把官方文档里面的代码示例塞给AI,再让它去生成代码,这样可以省掉自己来回折腾、调试和排查各种问题的时间。
结合whisper.cpp的调用,将整体程序整合如下,后面进一步优化后再发到git上:
import sounddevice as sd
import numpy as np
import wave
import os
import subprocess
import gradio as gr
import threading
import sys
import time
import struct
# Recording parameters
SAMPLE_RATE = 44100 # samples per second (Hz)
CHANNELS = 1 # mono
DTYPE = np.int16 # NOTE(review): not referenced by the code visible here
SEGMENT_DURATION = 5 # length of each recorded segment, in seconds
device_info = 'MacBook Pro Microphone' # passed as device= to sd.InputStream
buffer = [] # audio chunks appended by the recording callback
segment_files_path = '/.../buffers' # directory the segment WAV files are written to
trans_target_file = '/.../trans.log' # file the transcriptions are appended to
# Recording callback
def callback(indata, frames, time, status):
    """Store one chunk of captured audio and count the frames recorded.

    Passed as ``callback=`` to ``sd.InputStream``. Appends a copy of the
    chunk to the module-level ``buffer`` and adds ``frames`` to the
    module-level ``segment_frames_recorded`` counter.

    Args:
        indata: Samples captured for this chunk. A copy is stored so the
            saved chunk does not alias the array owned by the stream.
        frames: Number of frames in ``indata``.
        time: Timing information supplied by the stream (unused). Inside
            this function the name shadows the ``time`` module.
        status: Flags reported by the stream; written to stderr when set so
            that problems such as dropped input are not silently ignored.
    """
    # Only the counter is rebound; ``buffer`` is mutated in place.
    global segment_frames_recorded
    if status:
        print(status, file=sys.stderr)
    segment_frames_recorded += frames
    buffer.append(indata.copy())
def start_segment_recording(segment_filename):
    """Record one segment to a WAV file, then hand it over for transcription.

    Records ``SEGMENT_DURATION`` seconds from ``device_info``, writes the
    samples as 16-bit PCM to ``segment_files_path/segment_filename`` and
    calls ``stop_segment_recording`` with the full path, which closes the
    file and transcribes it.

    Args:
        segment_filename: File name (without directory) of the segment WAV.

    Raises:
        ValueError: If the stream delivered no audio at all
            (``np.concatenate`` on an empty list).
    """
    print("start_segment_recording start: ",segment_filename)
    global buffer, wavfile, segment_frames_recorded
    segment_filename = os.path.join(segment_files_path, segment_filename)
    buffer = []
    segment_frames_recorded = 0
    # Kept in a module-level name because stop_segment_recording closes it.
    wavfile = wave.open(segment_filename, mode="wb")
    try:
        wavfile.setnchannels(CHANNELS)
        wavfile.setsampwidth(2)  # 2 bytes for 16-bit samples
        wavfile.setframerate(SAMPLE_RATE)
        with sd.InputStream(callback=callback, device=device_info,
                            channels=CHANNELS, samplerate=SAMPLE_RATE):
            print("InputStream starting")
            # Sleep instead of spinning in a `while True` loop, which would
            # keep one CPU core fully busy for the whole segment.
            sd.sleep(int(SEGMENT_DURATION * 1000))  # milliseconds
        print("Recording ended")
        buffer_mix = np.concatenate(buffer, axis=0)
        # The stream delivers float samples (sounddevice's default dtype is
        # float32, nominally in [-1.0, 1.0]). Clip first so out-of-range
        # samples saturate instead of wrapping around in the int16 cast.
        wav_data = (np.clip(buffer_mix, -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16)
        wavfile.writeframes(wav_data.tobytes())
    except Exception:
        # Do not leak the open file when recording or writing fails.
        wavfile.close()
        raise
    stop_segment_recording(segment_filename)
def stop_segment_recording(segment_filename):
    """Close the current segment file, transcribe it and log the text.

    Closes the module-level ``wavfile``, runs ``run_whisper`` on the
    segment, prints the transcription and appends it as one line to
    ``trans_target_file``.

    Args:
        segment_filename: Full path of the segment WAV file to transcribe.
    """
    print("stop_segment_recording start")
    wavfile.close()
    transcription = run_whisper(segment_filename)
    print(transcription)
    # Explicit UTF-8 so the Chinese text is written the same way regardless
    # of the platform's default locale encoding.
    with open(trans_target_file, 'a', encoding='utf-8') as f:
        f.write(transcription + '\n')
def run_whisper(segment_filename):
    """Transcribe one recorded segment with whisper.cpp and return the text.

    The segment is first converted with ffmpeg to 16 kHz mono 16-bit PCM
    (written next to the input as ``<name>_out.wav``), then whisper.cpp is
    run with ``--output-txt``, and the resulting ``<name>_out.wav.txt``
    file is read back.

    Args:
        segment_filename: Path of the recorded segment WAV file.

    Returns:
        The contents of the text file produced by whisper.cpp.

    Raises:
        subprocess.CalledProcessError: If ffmpeg or whisper.cpp exits with a
            non-zero status.
    """
    segment_filename_out = os.path.splitext(segment_filename)[0] + '_out.wav'
    # Argument lists (no shell) keep paths containing spaces or shell
    # metacharacters intact. `-y` overwrites an existing output file instead
    # of blocking on ffmpeg's interactive "Overwrite?" prompt.
    ffmpeg_command = [
        'ffmpeg', '-y', '-i', segment_filename,
        '-acodec', 'pcm_s16le', '-ac', '1', '-ar', '16000',
        segment_filename_out,
    ]
    print(' '.join(ffmpeg_command))
    # check=True: fail loudly rather than transcribing a stale or missing file.
    subprocess.run(ffmpeg_command, check=True)
    whisper_command = [
        '/.../whisper.cpp/main',
        '-m', '/.../whisper.cpp/models/ggml-large.bin',
        '-l', 'zh', '--output-txt',
        '-f', segment_filename_out,
    ]
    subprocess.run(whisper_command, check=True)
    txt_filename = segment_filename_out + ".txt"
    with open(txt_filename, encoding='utf-8') as file:
        return file.read()
def recording_thread():
    """Record and transcribe consecutive segments until recording is stopped.

    Loops while the module-level ``recording`` flag is true, calling
    ``start_segment_recording`` with the file names ``segment_0.wav``,
    ``segment_1.wav``, ... in turn. The flag is only checked between
    segments, so a segment that has started always runs to completion.
    """
    print("recording_thread start")
    segment_index = 0
    while recording:
        segment_filename = f"segment_{segment_index}.wav"
        start_segment_recording(segment_filename)
        segment_index += 1
# Run the recording loop on a background thread and wait for it on the main
# thread, so that Ctrl+C can stop the program.
recording = True
# Pass the function itself as the target: `target=recording_thread()` would
# call it right here and run the endless loop on the main thread, without
# ever creating the thread.
# daemon=True lets the interpreter exit without waiting for the current segment.
thread = threading.Thread(target=recording_thread, daemon=True)
try:
    thread.start()
    print("Recording...")
    # Join with a timeout so the main thread stays responsive to Ctrl+C.
    while thread.is_alive():
        thread.join(timeout=0.5)
except KeyboardInterrupt:
    recording = False
    sys.exit('Interrupted by user')
except Exception as e:
    sys.exit(type(e).__name__ + ': ' + str(e))