Python+whisper.cpp纯本地化语音转文字

想要用Python+whisper.cpp实现纯本地化语音转文字，我的操作环境如下：

macOS Ventura 13.0
Python3.7
conda

PyAudio

一开始打算用PyAudio，解决了头文件找不到的问题（网上教程很多）之后，仍然遇到：

Could not import the PyAudio C module 'pyaudio._portaudio'.

Traceback (most recent call last):

File "<stdin>", line 1, in <module>

File "/.../site-packages/pyaudio/__init__.py", line 111, in <module>

import pyaudio._portaudio as pa

ImportError: dlopen(/.../site-packages/pyaudio/_portaudio.cpython-37-darwin.so, 0x0002): symbol not found in flat namespace '_PaMacCore_SetupChannelMap'

通过 nm 命令可以检查编译出来的 so 文件中该符号的情况：

nm -g /.../site-packages/pyaudio/_portaudio.cpython-37-darwin.so | grep " _PaMacCore_SetupChannelMap"

显示为:

U _PaMacCore_SetupChannelMap

这个 U 表示该符号未定义(undefined)。

另外检查发现 PaMacCore 相关的符号都是未定义的。

这说明在编译链接这个 Python 扩展模块时，没有正确链接到 PaMacCore 相关的符号。反复尝试重装 portaudio（Homebrew 包）和 pyaudio，问题依旧。推测是 portaudio/pyaudio 对 macOS Ventura 的支持还不完善，需要等待后续新版本，于是放弃 pyaudio，转而使用 sounddevice。

sounddevice

一开始，我直接让AI基于sounddevice程序包给我写了一段代码，但怎么都录不上音，调整过指定输入设备等相关代码还是不行。于是从官方文档找了一段示例代码：https://python-sounddevice.readthedocs.io/en/0.4.6/examples.html#real-time-text-mode-spectrogram

让AI基于该代码去一次次修改，然后再回过头来删掉没什么用的代码，留下一个带注释的极简版本：

import sounddevice as sd
import numpy as np
import wave
import time

# Recording configuration: length of the take, sampling rate, channel count,
# and the name of the input device to open.
duration = 5  # seconds
sample_rate = 44_100  # Hz
channels = 1
device_info = 'MacBook Pro Microphone'

# Audio chunks handed to the stream callback are collected in this list.
buffer = list()

# Stream callback that collects the incoming audio data
def callback(indata, frames, time, status):
    """Store a copy of one block of captured audio in the module-level ``buffer``.

    Passed as ``callback=`` to ``sd.InputStream``. The ``time`` parameter
    shadows the ``time`` module inside this function only; it is not used.

    Args:
        indata: Array holding the captured block; a copy is stored
            (presumably the array is only valid during the call -- confirm
            against the sounddevice documentation).
        frames: Number of frames in ``indata`` (unused).
        time: Timing information supplied by the stream (unused).
        status: Stream status flags; truthy when something went wrong.
    """
    if status:
        # Surface stream problems (e.g. input overflow) instead of dropping
        # them silently.
        print(status)
    # ``buffer`` is only mutated, never rebound, so no ``global`` is needed.
    buffer.append(indata.copy())

# Open an input stream and record for `duration` seconds
print(device_info)
print(sample_rate)
# Use the `channels` setting rather than repeating the literal 1, so the
# stream and the configuration cannot drift apart.
with sd.InputStream(callback=callback, device=device_info, samplerate=sample_rate, channels=channels):
    print('Recording...')
    # sd.sleep() takes milliseconds; int() keeps a fractional `duration`
    # from being passed through as a float.
    sd.sleep(int(duration * 1000))
    print('Recording finished')

# Save the buffered data to a WAV file
# Join the per-callback chunks into one array. A separate name is used so
# `buffer` stays a list.
recorded = np.concatenate(buffer, axis=0)
filename = 'recorded_audio.wav'

with wave.open(filename, 'wb') as wf:
    wf.setnchannels(channels)
    wf.setsampwidth(2)  # 2 bytes for 16-bit samples
    wf.setframerate(sample_rate)
    # Convert to 16-bit integers before writing. Assumes the stream delivered
    # float samples nominally in [-1.0, 1.0] (no dtype is passed to
    # InputStream -- confirm the default). Clipping first keeps any
    # out-of-range sample from wrapping around in the int16 cast.
    int16_peak = np.iinfo(np.int16).max
    wav_data = (np.clip(recorded, -1.0, 1.0) * int16_peak).astype(np.int16)
    wf.writeframes(wav_data.tobytes())

print(f"File saved as {filename}")

终于把声音给录上了。回过头来查看，问题出在buffer的定义上，一开始AI给我生成的代码是：

# Create a buffer for storing the recorded audio
# (the AI-generated definition that did not work: a preallocated array
# instead of an empty list)
buffer = np.zeros((duration * sample_rate,))

其实只需要将buffer声明为空列表（buffer = []）就可以了，回调函数里通过 buffer.append 逐块追加录到的数据。

AI写代码坑还是很多啊。使用某个Python程序包的时候，最好先把官方文档里的代码示例塞给AI，再让它去生成代码，这样可以省掉来回折腾、自己调试和排查各种问题的功夫。

结合whisper.cpp的调用，将整体程序整合如下，后面进一步优化后再发到Git上：

import sounddevice as sd
import numpy as np
import wave
import os
import subprocess
import gradio as gr
import threading
import sys
import time
import struct

# Recording parameters
SAMPLE_RATE = 44_100  # capture rate in Hz
CHANNELS = 1
DTYPE = np.int16
SEGMENT_DURATION = 5  # each segment is 5 seconds long
device_info = 'MacBook Pro Microphone'
buffer = list()  # audio chunks collected by the stream callback
segment_files_path = '/.../buffers'  # directory for the per-segment WAV files
trans_target_file = '/.../trans.log'  # transcripts are appended to this file

# Recording callback
def callback(indata, frames, time, status):
    """Collect one block of captured audio for the current segment.

    Passed as ``callback=`` to ``sd.InputStream``. Appends a copy of
    ``indata`` to the module-level ``buffer`` and adds ``frames`` to the
    module-level ``segment_frames_recorded`` counter.

    Args:
        indata: Array holding the captured block.
        frames: Number of frames in ``indata``.
        time: Timing information supplied by the stream (unused; shadows the
            ``time`` module inside this function only).
        status: Stream status flags; truthy when something went wrong.
    """
    global segment_frames_recorded
    if status:
        # Report stream problems (e.g. input overflow) rather than ignoring
        # them, so dropped audio is visible in the log.
        print(status, file=sys.stderr)
    segment_frames_recorded += frames
    # ``buffer`` is only mutated here, so it needs no ``global`` declaration.
    buffer.append(indata.copy())

def start_segment_recording(segment_filename):
    """Record one segment to a WAV file, then hand it off for transcription.

    Resets the module-level ``buffer`` and ``segment_frames_recorded``, opens
    the module-level ``wavfile`` for writing, records from ``device_info``
    for ``SEGMENT_DURATION`` seconds, writes the captured samples as 16-bit
    PCM and finally calls ``stop_segment_recording``, which closes
    ``wavfile`` and runs the transcription.

    Args:
        segment_filename: File name of the segment, relative to
            ``segment_files_path``.
    """
    print("start_segment_recording start: ",segment_filename)
    global buffer, wavfile, segment_frames_recorded
    segment_filename = "{}/{}".format(segment_files_path, segment_filename)
    buffer = []
    segment_frames_recorded = 0
    wavfile = wave.open(segment_filename, mode="wb")
    try:
        wavfile.setnchannels(CHANNELS)
        wavfile.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wavfile.setframerate(SAMPLE_RATE)
        with sd.InputStream(callback=callback, device=device_info, channels=CHANNELS, samplerate=SAMPLE_RATE):
            print("InputStream starting")
            # Sleep for the segment length instead of spinning in a
            # `while True` loop, which kept one CPU core fully busy for the
            # whole segment. sd.sleep() takes milliseconds.
            sd.sleep(int(SEGMENT_DURATION * 1000))
        print("Recording ended")
        buffer_mix = np.concatenate(buffer, axis=0)
        # Scale to the int16 range. Assumes the stream delivers float samples
        # nominally in [-1.0, 1.0] (no dtype is passed to InputStream --
        # confirm the default).
        wav_data = (buffer_mix * np.iinfo(np.int16).max).astype(np.int16)
        wavfile.writeframes(wav_data.tobytes())
    except Exception:
        # Do not leak the open file handle when recording or writing fails.
        wavfile.close()
        raise
    stop_segment_recording(segment_filename)

def stop_segment_recording(segment_filename):
    """Close the current segment's WAV file and append its transcript to the log.

    Closes the module-level ``wavfile``, runs ``run_whisper`` on the segment,
    prints the transcript and appends it as one line to ``trans_target_file``.

    Args:
        segment_filename: Path of the recorded segment WAV file.
    """
    print("stop_segment_recording start")
    # ``wavfile`` and ``trans_target_file`` are only read here, so no
    # ``global`` declaration is required.
    wavfile.close()
    transcription = run_whisper(segment_filename)
    print(transcription)
    # The transcript is Chinese text (whisper runs with ``-l zh``); write it
    # as UTF-8 explicitly instead of relying on the locale's default encoding.
    with open(trans_target_file, 'a', encoding='utf-8') as f:
        f.write(transcription + '\n')

def run_whisper(segment_filename):
    """Transcribe one recorded segment with whisper.cpp and return the text.

    The segment is first converted with ffmpeg to a mono, 16 kHz, 16-bit PCM
    WAV file named ``<segment>_out.wav``. whisper.cpp is then run on that
    file with ``--output-txt``, and the text file ``<segment>_out.wav.txt``
    is read back.

    Args:
        segment_filename: Path of the recorded segment WAV file.

    Returns:
        The transcript text read from the whisper.cpp output file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg or whisper.cpp exits with a
            non-zero status.
    """
    base, _ext = os.path.splitext(segment_filename)
    segment_filename_out = base + '_out.wav'

    # Argument lists (no shell) keep paths containing spaces or shell
    # metacharacters intact. ``-y`` overwrites an output left over from an
    # earlier run; without it ffmpeg stops and asks before overwriting.
    ffmpeg_cmd = [
        'ffmpeg', '-y',
        '-i', segment_filename,
        '-acodec', 'pcm_s16le',
        '-ac', '1',
        '-ar', '16000',
        segment_filename_out,
    ]
    print(' '.join(ffmpeg_cmd))
    # check=True: fail here rather than transcribing a missing or stale file.
    subprocess.run(ffmpeg_cmd, check=True)

    whisper_cmd = [
        '/.../whisper.cpp/main',
        '-m', '/.../whisper.cpp/models/ggml-large.bin',
        '-l', 'zh',
        '--output-txt',
        '-f', segment_filename_out,
    ]
    subprocess.run(whisper_cmd, check=True)

    # The transcript is read from "<input>.txt", next to the converted file.
    txt_filename = segment_filename_out + ".txt"
    with open(txt_filename, encoding='utf-8') as file:
        return file.read()

def recording_thread():
    """Record consecutive numbered segments while the ``recording`` flag is set.

    Each pass records ``segment_<n>.wav`` through ``start_segment_recording``
    and then advances ``n``; the flag is re-checked between segments.
    """
    print("recording_thread start")
    next_index = 0
    while recording:
        start_segment_recording(f"segment_{next_index}.wav")
        next_index = next_index + 1
# Script entry: run the recorder on a worker thread and wait for it.
#
# The callable itself is passed as `target`. Writing
# `target=recording_thread()` would run the whole recording loop right here
# in the main thread and hand its return value (None) to Thread.
recording = True
thread = threading.Thread(target=recording_thread)
try:
    thread.start()
    print("Recording...")
    # Join with a timeout so the main thread stays responsive to Ctrl-C.
    while thread.is_alive():
        thread.join(timeout=0.5)
except KeyboardInterrupt:
    # Ask the worker to stop; it finishes the segment in progress (including
    # its transcription) before the loop condition is re-checked.
    recording = False
    thread.join()
    sys.exit('Interrupted by user')
except Exception as e:
    # Only errors raised in the main thread arrive here; an exception inside
    # the worker ends that thread and is reported by the threading module.
    sys.exit(type(e).__name__ + ': ' + str(e))