Python 离线语音歌词识别播放器（歌词导入导出txt文本）

__lost

37人浏览 · 2026-06-28 04:51:40

__lost · 2026-06-28 04:51:40 发布

本文介绍了一个高精度歌词播放器的Python实现，基于tkinter图形界面，具备以下核心功能：

音频处理功能：

支持MP3/WAV/FLAC格式播放
使用pydub+simpleaudio实现播放控制（播放/暂停/停止）

歌词处理功能：

加载标准TXT格式歌词文件（时间戳+文本）
通过Whisper large-v3模型实现高精度语音识别生成歌词
支持歌词导出为标准TXT格式

可视化效果：

动态彩色同步歌词显示
动态柱状频谱动画（由正弦波叠加随机噪声模拟生成，并非对音频数据做真实频谱分析）
具备网络检测和FFmpeg依赖检查

技术特点：

通过 tkinter 的 after 定时回调（约每 33 毫秒一次）在主线程中循环渲染歌词与频谱
实现歌词播放进度缓存机制
支持离线环境下的基本播放功能

该播放器适合需要精确歌词同步的场景，特别是中文歌曲的歌词显示和编辑需求。

艾北 - 游京（艾北版），以下为语音识别输出的歌词：

00:00.00 我走在长街中 听戏子唱京城
00:04.86 人杂乱戏小丑 夜晃退入长秋
00:10.85 悠悠的孤城中
00:23.04 我走在长街中 听戏子唱京城
00:28.35 人杂乱戏小丑 夜望退入长秋
00:33.79 悠悠的孤城中 听美人奏琴声
00:38.57 不舍小声离她
00:45.46 滔滔江水 悠悠大云河畔
00:50.06 悠悠孤城 春心荡漾
00:55.17 我闻着冰香
00:58.07 悠悠的孤城 听喜子唱京城
00:58.34 来到了街中央
01:00.32 看街边都是火浪
01:03.14 我寻一朵吉祥
01:08.17 何处声云 我听琴声奏起
01:13.06 我寻神而去 原来有人在唱戏
01:18.65 游进繁华 你看美人蒙山
01:23.29 金袍跨马 微风凛凛
01:32.00 我走在长街中 听喜子唱京城
01:37.37 人杂乱些小畜 夜望退入长秋
01:42.81 悠悠的孤城中 听梅兰奏琴声
01:47.56 朗朗夜色星空 望海同繁华灯
01:52.90 盼望群起潦沙 也不解诉求
01:57.35 我走在长街中 听喜子唱京城
01:57.45 军载着黑军马 微风凛凛许云沓
02:03.20 破本一翠天涯 又走进西方花
02:07.98 不舍小生离他
02:33.59 我走在长街中 听喜子唱京城
02:38.63 人杂乱些小畜 夜望退入长秋
02:44.03 悠悠的孤城中 听美人奏琴声
02:48.81 朗朗夜色星空 望海同繁华灯
02:54.15 盼望群起潦沙 也不解诉求
02:57.43 我走在长街中 听喜子唱京城
02:59.56 军载着黑军马 微风凛凛许云沓
03:04.47 破本一翠天涯 又走进西方花
03:09.25 不舍小生离他

import tkinter as tk
from tkinter import ttk, filedialog, messagebox
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import time
import random
import whisper
import os
import subprocess
import tempfile
import socket
import re
from pydub import AudioSegment
import simpleaudio as sa

# 网络检测
def is_network_available(host="www.baidu.com", port=80, timeout=3):
    """Report whether a TCP connection to ``host:port`` can be opened.

    Args:
        host: Host name or address to probe.
        port: TCP port to probe.
        timeout: Connection timeout in seconds.

    Returns:
        True when the connection succeeds, False on any socket-level
        failure (name resolution error, refusal, timeout).
    """
    try:
        # The context manager closes the probe socket as soon as the
        # connection has been established, so no descriptor is leaked.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        # socket.timeout and socket.gaierror are both OSError subclasses;
        # unrelated exceptions (e.g. KeyboardInterrupt) still propagate.
        return False

# FFmpeg 全局检测
def check_ffmpeg():
    """Return True when ``ffmpeg -version`` runs and exits successfully.

    Any failure (executable not found, non-zero exit status, ...) is
    reported as False rather than raised.
    """
    probe_cmd = ["ffmpeg", "-version"]
    try:
        subprocess.run(
            probe_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            check=True,
        )
    except Exception:
        return False
    return True

# ===================== TXT本地歌词解析模块 =====================
class TxtLyricParser:
    """Parse a timestamped lyric text file into ``(start, end, text)`` tuples.

    Accepted line shapes are ``MM:SS.FF text`` and ``[MM:SS.FF]text`` where
    the fraction ``FF`` may have one to three digits. Lines that do not match
    are ignored. The result is stored in ``lyric_list``, sorted by start time.
    """

    # Seconds a line stays active when no later line starts sooner.
    DEFAULT_LINE_SECONDS = 4

    # Optional "[", minutes ":" seconds "." fraction, then either "]" or
    # whitespace, then the lyric text.
    _LINE_PATTERN = re.compile(
        r'\[?(\d{1,3}):(\d{1,2})\.(\d{1,3})(?:\]\s*|\s+)(.*)'
    )

    def __init__(self, txt_path):
        """Parse ``txt_path`` immediately; results land in ``self.lyric_list``."""
        self.lyric_list = []
        self.parse_txt_lyric(txt_path)

    def parse_txt_lyric(self, path):
        """Read ``path`` and add every recognised lyric line to ``lyric_list``.

        Args:
            path: Path of a UTF-8 text file (with or without a BOM).

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # "utf-8-sig" strips a leading BOM; with plain "utf-8" the BOM stays
        # glued to the first timestamp and that line silently fails to match.
        with open(path, "r", encoding="utf-8-sig") as f:
            for raw_line in f:
                found = self._LINE_PATTERN.match(raw_line.strip())
                if not found:
                    continue
                minutes, seconds, fraction, text = found.groups()
                if not text:
                    # Timestamp without lyric text: nothing to display.
                    continue
                # Scale by the number of digits so ".5", ".50" and ".500"
                # all mean half a second.
                start = (
                    int(minutes) * 60
                    + int(seconds)
                    + int(fraction) / 10 ** len(fraction)
                )
                self.lyric_list.append(
                    (start, start + self.DEFAULT_LINE_SECONDS, text)
                )

        self.lyric_list.sort(key=lambda item: item[0])

        # A line must stop being active once the next one begins; otherwise
        # closely spaced lines overlap and the earlier one keeps matching.
        clipped = []
        followers = self.lyric_list[1:] + [None]
        for (start, end, text), follower in zip(self.lyric_list, followers):
            if follower is not None and start < follower[0] < end:
                end = follower[0]
            clipped.append((start, end, text))
        self.lyric_list[:] = clipped

# ===================== 本地语音识别（large-v3 高精度模型） =====================
class LocalAudioLyricGen:
    """Generate timed lyric lines from an audio file with a Whisper model."""

    def __init__(self, model_root="./whisper_models"):
        """Load the ``large-v3`` model, using ``model_root`` as download root.

        Args:
            model_root: Directory that holds (or will receive) the weights.

        Raises:
            ConnectionError: If ``<model_root>/large-v3.pt`` is absent and no
                network connection is available.
        """
        self.model_name = "large-v3"
        model_path = os.path.join(model_root, f"{self.model_name}.pt")

        # load_model is expected to fetch missing weights on its own, so the
        # only case to reject up front is "no local file and no network".
        if not os.path.exists(model_path) and not is_network_available():
            raise ConnectionError(f"无网络，本地无{self.model_name}.pt模型，无法语音识别！")
        self.model = whisper.load_model(self.model_name, download_root=model_root)

    def audio_to_lrc_data(self, audio_path):
        """Transcribe ``audio_path`` and return ``[(start, end, text), ...]``.

        The audio is first converted by ffmpeg to a temporary 16 kHz mono wav
        file, which is removed again whether or not transcription succeeds.

        Args:
            audio_path: Path of the audio file to transcribe.

        Returns:
            One tuple per non-empty transcription segment, in the order the
            model reports them.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
            FileNotFoundError: If the ffmpeg executable cannot be started.
        """
        print("高精度AI语音识别中，large-v3模型处理较慢请等待...")
        tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp_path = tmp_wav.name
        # Close our handle so ffmpeg can overwrite the file by name.
        tmp_wav.close()
        try:
            cmd = [
                "ffmpeg", "-y", "-i", audio_path,
                "-ar", "16000", "-ac", "1", "-f", "wav", tmp_path
            ]
            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if proc.returncode != 0:
                # Without this check a failed conversion leaves an empty wav
                # behind and the failure surfaces later with no useful cause.
                stderr_text = proc.stderr.decode("utf-8", errors="replace").strip()
                last_line = stderr_text.splitlines()[-1] if stderr_text else ""
                raise RuntimeError(
                    f"FFmpeg转码失败（退出码{proc.returncode}）：{last_line}"
                )
            result = self.model.transcribe(tmp_path, word_timestamps=True, language="zh")
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

        lyric_list = []
        for seg in result["segments"]:
            text = seg["text"].strip()
            if text:
                lyric_list.append((seg["start"], seg["end"], text))
        print("语音识别解析完成")
        return lyric_list

# ===================== 新版播放器（pydub+simpleaudio） =====================
class MusicPlayer:
    def __init__(self, root):
        """Initialise player state, probe optional dependencies, build the UI.

        Args:
            root: The Tk root window that hosts the player.
        """
        self.root = root
        self.root.title("高精度歌词播放器 | TXT导入导出+动态柱状频谱")
        self.root.geometry("1220x820")
        self.root.configure(bg="#040428")

        # Audio playback state.
        self.music_path = ""
        self.audio_segment = None
        self.play_obj = None
        self.is_playing = False
        self.play_start_time = 0.0
        self.pause_offset = 0.0
        self.total_audio_sec = 0.0

        # Lyric data, always a list of (start, end, text) tuples.
        self.lyric_time_data = []
        self.bar_count = 70
        self.bar_heights = np.zeros(self.bar_count)
        # Index of the lyric line shown last; lets the display hold its
        # position while playback is paused. -1 means "nothing shown yet".
        self.cached_lyric_idx = -1

        # Speech-recognition availability.
        self.lyric_generator = None
        self.can_recognize = False

        # FFmpeg check: warn, but keep starting up.
        if not check_ffmpeg():
            messagebox.showerror("缺失FFmpeg", "未检测到FFmpeg，语音识别与音频解码全部失效！\n请下载FFmpeg并配置系统环境变量后重启程序")

        # Speech recognition is optional. This is the startup boundary, so
        # any failure while loading the model (not only a missing file or
        # missing network) must degrade to "recognition disabled" instead of
        # aborting the whole player.
        try:
            self.lyric_generator = LocalAudioLyricGen()
            self.can_recognize = True
        except Exception as e:
            self.lyric_generator = None
            self.can_recognize = False
            messagebox.showwarning("语音识别功能受限", f"{str(e)}\n播放器可正常播放音乐、加载TXT歌词")

        self.build_gui()
        self.update_render_loop()

    def build_gui(self):
        """Create the button rows, the lyric canvas and the bar-chart canvas."""
        window_bg = "#040428"
        button_font = ("微软雅黑",10)

        def packed_button(parent, label, handler, color):
            # Create one toolbar button, pack it to the left, and hand it back.
            button = tk.Button(
                parent, text=label, command=handler,
                bg=color, fg="white", font=button_font
            )
            button.pack(side=tk.LEFT, padx=5)
            return button

        # Row 1: audio / lyric loading, recognition and export.
        load_row = tk.Frame(self.root, bg=window_bg)
        load_row.pack(fill=tk.X, padx=12, pady=6)
        packed_button(load_row, "加载本地音频MP3/WAV", self.load_audio_file, "#109970")
        packed_button(load_row, "加载TXT歌词文件", self.load_txt_lyric, "#2288dd")
        self.btn_recognize = packed_button(
            load_row, "高精度AI生成歌词(large-v3)",
            self.generate_lyric_by_voice, "#9933dd"
        )
        packed_button(load_row, "导出歌词到TXT文件", self.export_lyric_to_txt, "#dd7722")
        if not self.can_recognize:
            # Recognition could not be initialised: grey the button out.
            self.btn_recognize.config(state=tk.DISABLED, bg="#666666")

        # Row 2: playback controls. The play button is kept on self because
        # other methods update its label.
        control_row = tk.Frame(self.root, bg=window_bg)
        control_row.pack(fill=tk.X, padx=12, pady=6)
        self.btn_play = packed_button(control_row, "播放/暂停", self.toggle_play, "#dd4422")
        packed_button(control_row, "停止（停在当前位置）", self.stop_audio_keep_pos, "#dd9922")
        packed_button(control_row, "从头播放", self.play_from_start, "#22bb88")

        # Lyric canvas.
        self.lyric_canvas = tk.Canvas(self.root, bg="#08083a", height=260)
        self.lyric_canvas.pack(fill=tk.X, padx=12, pady=10)

        # Bar-chart figure embedded in the Tk window.
        self.fig, self.ax = plt.subplots(figsize=(12, 3.6), dpi=100)
        self.fig.patch.set_facecolor(window_bg)
        self.ax.set_facecolor(window_bg)
        bar_positions = np.arange(self.bar_count)
        initial_heights = np.zeros(self.bar_count)
        self.bars = self.ax.bar(
            bar_positions,
            initial_heights,
            width=0.72,
            color="#00eeff",
            edgecolor="#ffffff22"
        )
        self.ax.set_xlim(-1, self.bar_count)
        self.ax.set_ylim(0, 1.0)
        self.ax.set_xticks([])
        self.ax.set_yticks([])
        self.canvas_bar = FigureCanvasTkAgg(self.fig, master=self.root)
        self.canvas_bar.draw()
        bar_widget = self.canvas_bar.get_tk_widget()
        bar_widget.pack(fill=tk.BOTH, expand=True, padx=12, pady=8)

    # 【新增】导出当前歌词为标准TXT歌词文件
    def export_lyric_to_txt(self):
        if len(self.lyric_time_data) == 0:
            messagebox.showwarning("提示", "暂无歌词可导出！请先AI识别或加载TXT歌词")
            return
        # 选择保存路径
        save_path = filedialog.asksaveasfilename(
            defaultextension=".txt",
            filetypes=[("歌词文本文件", "*.txt")],
            initialfile="lyric_output.txt"
        )
        if not save_path:
            return
        try:
            with open(save_path, "w", encoding="utf-8") as f:
                for start_sec, end_sec, text in self.lyric_time_data:
                    # 秒 转 分:秒.毫秒 格式
                    minute = int(start_sec // 60)
                    sec = int(start_sec % 60)
                    ms = int((start_sec - int(start_sec)) * 100)
                    time_str = f"{minute:02d}:{sec:02d}.{ms:02d}"
                    f.write(f"{time_str} {text}\n")
            messagebox.showinfo("导出成功", f"歌词已保存至：\n{save_path}")
        except Exception as e:
            messagebox.showerror("导出失败", f"写入文件出错：{str(e)}")

    # 加载本地TXT歌词
    def load_txt_lyric(self):
        """Let the user pick a lyric text file and make it the current lyrics.

        The parsed list replaces the current lyrics and the cached line index
        is reset. A message box reports success, an empty parse result, or a
        read error.
        """
        chosen_path = filedialog.askopenfilename(filetypes=[("文本歌词", "*.txt")])
        if not chosen_path:
            return
        try:
            self.lyric_time_data = TxtLyricParser(chosen_path).lyric_list
            # Forget the line shown for the previous lyric set.
            self.cached_lyric_idx = -1
            line_count = len(self.lyric_time_data)
            if line_count:
                messagebox.showinfo("加载成功", f"TXT歌词载入完成，共{line_count}句")
            else:
                messagebox.showerror("解析失败", "txt格式不匹配！\n格式示例：00:00.50 歌词内容")
        except Exception as err:
            messagebox.showerror("读取失败", str(err))

    def load_audio_file(self):
        """Let the user pick an audio file and make it the current track.

        On success playback is stopped, the position is reset to zero and the
        current lyrics are cleared. A decoding failure is reported in a
        message box and leaves the reset steps undone.
        """
        chosen_path = filedialog.askopenfilename(
            filetypes=[("音频文件", "*.mp3 *.wav *.flac")]
        )
        if not chosen_path:
            return
        try:
            self.audio_segment = AudioSegment.from_file(chosen_path)
            # len() of the segment is used as milliseconds here.
            self.total_audio_sec = len(self.audio_segment) / 1000.0
            self.music_path = chosen_path
        except Exception as err:
            messagebox.showerror("音频加载失败", f"无法读取音频：{str(err)}")
            return
        else:
            # New track: drop every piece of state tied to the previous one.
            self.stop_audio()
            self.is_playing = False
            self.btn_play.config(text="播放/暂停")
            self.pause_offset = 0.0
            self.lyric_time_data.clear()
            self.cached_lyric_idx = -1
            messagebox.showinfo("提示", "音频加载完成，可加载TXT歌词或AI生成歌词")

    # 停止音频，保留当前播放进度
    def stop_audio_keep_pos(self):
        if self.play_obj and self.play_obj.is_playing():
            self.pause_offset += time.time() - self.play_start_time
        self.stop_audio()
        self.is_playing = False
        self.btn_play.config(text="播放/暂停")

    # 从头播放，重置进度+清空歌词缓存
    def play_from_start(self):
        if self.audio_segment is None:
            messagebox.showwarning("提示", "请先加载音频文件")
            return
        self.stop_audio()
        self.pause_offset = 0.0
        self.cached_lyric_idx = -1  # 从头播放重置歌词缓存
        self.is_playing = False
        self.toggle_play()

    def stop_audio(self):
        if self.play_obj and self.play_obj.is_playing():
            self.play_obj.stop()
        self.play_obj = None

    def toggle_play(self):
        if self.audio_segment is None:
            messagebox.showwarning("提示", "请先加载音频文件")
            return
        if self.is_playing:
            # 暂停，保存偏移
            self.pause_offset += time.time() - self.play_start_time
            self.stop_audio()
            self.is_playing = False
            self.btn_play.config(text="播放/暂停")
        else:
            current_ms = int(self.pause_offset * 1000)
            audio_slice = self.audio_segment[current_ms:]
            raw_data = audio_slice.raw_data
            sample_width = audio_slice.sample_width
            channels = audio_slice.channels
            rate = audio_slice.frame_rate
            self.play_obj = sa.play_buffer(raw_data, channels, sample_width, rate)
            self.play_start_time = time.time()
            self.is_playing = True
            self.btn_play.config(text="暂停")

    def generate_lyric_by_voice(self):
        """Transcribe the loaded audio into lyrics without freezing the window.

        Transcription runs in a background thread. The Tk main thread polls
        for completion and is the only thread that touches widgets or player
        state. While a job is running the recognise button is disabled.
        """
        # Function-scope import: threading is not in the file's import block.
        import threading

        if not self.can_recognize or self.lyric_generator is None:
            messagebox.showerror("不可用", "高精度语音识别未初始化成功")
            return
        if self.music_path == "":
            messagebox.showwarning("警告", "请先加载音频文件！")
            return

        audio_path = self.music_path
        outcome = {}

        def transcribe():
            # Worker thread: only fills the local dict, never touches Tk.
            try:
                outcome["lyrics"] = self.lyric_generator.audio_to_lrc_data(audio_path)
            except Exception as e:
                outcome["error"] = e

        worker = threading.Thread(target=transcribe, daemon=True)

        def finish_when_done():
            # Main thread: re-schedule until the worker has ended.
            if worker.is_alive():
                self.root.after(200, finish_when_done)
                return
            self.root.config(cursor="")
            self.btn_recognize.config(state=tk.NORMAL)
            if "error" in outcome:
                messagebox.showerror("识别异常", f"{str(outcome['error'])}")
                return
            if audio_path != self.music_path:
                # Another track was loaded meanwhile; these lyrics belong to
                # the old one, so drop them.
                return
            self.lyric_time_data = outcome["lyrics"]
            self.cached_lyric_idx = -1  # new lyrics: forget the cached line
            if len(self.lyric_time_data) == 0:
                messagebox.showerror("识别失败", "音频未检测到人声歌词")
            else:
                messagebox.showinfo("识别完成", f"高精度模型识别成功，共 {len(self.lyric_time_data)} 句歌词\n可点击【导出歌词到TXT】保存")

        self.root.config(cursor="wait")
        # Prevent a second job from being started while this one runs.
        self.btn_recognize.config(state=tk.DISABLED)
        worker.start()
        self.root.after(200, finish_when_done)

    def draw_color_sync_lyric(self):
        self.lyric_canvas.delete("all")
        canvas_w = self.lyric_canvas.winfo_width()
        canvas_h = self.lyric_canvas.winfo_height()
        center_y = canvas_h // 2

        if len(self.lyric_time_data) == 0:
            tip = "加载音频后：①加载TXT歌词 ②或AI生成歌词 ③识别后可导出TXT"
            self.lyric_canvas.create_text(
                canvas_w//2, center_y,
                text=tip, font=("微软雅黑", 20), fill="#aaaacc"
            )
            return

        current_time = 0.0
        if self.is_playing:
            current_time = time.time() - self.play_start_time + self.pause_offset
            # 播放状态：实时匹配并更新缓存下标
            current_idx = -1
            for idx, (start, end, text) in enumerate(self.lyric_time_data):
                if start <= current_time <= end:
                    current_idx = idx
                    break
            # 找到有效行更新缓存
            if current_idx != -1:
                self.cached_lyric_idx = current_idx
        else:
            # 暂停状态：直接使用缓存下标，不再重新遍历匹配（核心修复）
            current_idx = self.cached_lyric_idx

        current_text = ""
        prev_text = ""
        next_text = ""
        total_lines = len(self.lyric_time_data)

        # 兜底处理缓存下标越界
        if 0 <= current_idx < total_lines:
            current_text = self.lyric_time_data[current_idx][2]
            if current_idx > 0:
                prev_text = self.lyric_time_data[current_idx - 1][2]
            if current_idx + 1 < total_lines:
                next_text = self.lyric_time_data[current_idx + 1][2]
        else:
            # 无缓存时显示第一行
            current_text = self.lyric_time_data[0][2] if total_lines > 0 else ""

        self.lyric_canvas.create_text(
            canvas_w//2, center_y - 75,
            text=prev_text, font=("微软雅黑", 16), fill="#7777aa"
        )
        self.lyric_canvas.create_text(
            canvas_w//2, center_y + 75,
            text=next_text, font=("微软雅黑", 16), fill="#7777aa"
        )

        color_list = ["#ff2266", "#ffaa22", "#ffff33", "#22ee99", "#22ccff", "#aa66ff"]
        start_x = canvas_w // 2 - len(current_text) * 22
        for char_idx, char in enumerate(current_text):
            fill_color = color_list[char_idx % len(color_list)]
            self.lyric_canvas.create_text(
                start_x + char_idx * 42, center_y,
                text=char, font=("微软雅黑", 38, "bold"), fill=fill_color
            )

    # 动态真实起伏柱状频谱（随播放时间变化振幅）
    def update_bar_spectrum_effect(self):
        base_amp = 0.04
        if self.is_playing:
            play_total_sec = self.pause_offset + (time.time() - self.play_start_time)
            t = play_total_sec
            freq1 = np.sin(t * 1.2)
            freq2 = np.sin(t * 3.7)
            freq3 = np.sin(np.linspace(0, 16 * np.pi, self.bar_count) + t * 5)
            noise = np.random.uniform(-0.22, 0.22, self.bar_count)
            base_amp = np.clip((np.abs(freq1) + np.abs(freq2)) / 2, 0.15, 0.85)
            wave = base_amp * np.abs(freq3) + noise
            bar_heights = np.clip(wave, 0, 1)
        else:
            bar_heights = np.full(self.bar_count, base_amp)

        for bar, h in zip(self.bars, bar_heights):
            bar.set_height(h)
        self.canvas_bar.draw()

    def update_render_loop(self):
        self.update_bar_spectrum_effect()
        self.draw_color_sync_lyric()
        self.root.after(33, self.update_render_loop)

# Script entry point: create the Tk root window, attach the player to it,
# and hand control to the Tk event loop until the window is closed.
if __name__ == "__main__":
    main_window = tk.Tk()
    # Constructing MusicPlayer builds the GUI and starts the render loop.
    app = MusicPlayer(main_window)
    main_window.mainloop()

AI Agent技术社区

Agent 垂直技术社区，欢迎活跃、内容共建。

更多推荐

AI Agent 的状态管理：工作流与图结构

相比于 FSM 的线性转移，图结构天然支持： | 能力 | 图结构支持 | 传统 FSM 支持 | |------|------------|---------------| | 分支条件判断 | ✅ 多条出边 | ⚠️ 需扩展 | | 循环与回溯 | ✅ 有向环 | ⚠️ 需特殊处理 | | 并行执行 | ✅ 多分支同步 | ❌ 不支持 | | 动态路由 | ✅ 运行时决定下一路径 | ⚠️ 受

AI Agent技术社区

AI Agent 的部署与运维：从原型到生产

这篇文章将系统梳理 AI Agent 从原型到生产的完整链路，涵盖容器化部署、服务化架构、负载均衡、版本管理、监控告警与故障恢复，并提供可直接落地的代码示例。在将 Agent 从原型阶段推向生产时，团队通常会遇到以下痛点： | 挑战类别 | 具体表现 | 潜在影响 | |---------|---------|---------| || 缺乏日志、指标和链路追踪 | 问题定位困难，故障恢复缓慢 |