Fresh start - excluded large ROM JSON files
This commit is contained in:
443
docker/discord-voice-bot/main.py
Normal file
443
docker/discord-voice-bot/main.py
Normal file
@@ -0,0 +1,443 @@
|
||||
"""
|
||||
Discord Voice Bot - Simple GLaDOS Voice Version
|
||||
Uses Wyoming Whisper for STT, Ollama for LLM, HTTP TTS for GLaDOS voice.
|
||||
Works WITHOUT discord.sinks (manual audio capture)
|
||||
"""
|
||||
|
||||
import logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import wave
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import numpy as np
|
||||
import requests
|
||||
import yaml
|
||||
import discord
|
||||
from discord.ext import commands
|
||||
import json
|
||||
|
||||
# Import Wyoming protocol
# Optional dependency: when missing, WYOMING_AVAILABLE gates all Wyoming use.
try:
    from wyoming.client import AsyncTcpClient
    from wyoming.audio import AudioChunk, AudioStart, AudioStop
    from wyoming.asr import Transcribe, Transcript
    WYOMING_AVAILABLE = True
except ImportError:
    logger.warning("Wyoming library not available")
    WYOMING_AVAILABLE = False

# Optional: Import GLaDOS ASR (Windows path)
# NOTE(review): hard-coded Windows path makes this import host-specific;
# inside the Docker image this likely always falls through to ImportError.
sys.path.insert(0, r'C:\glados\src')
try:
    from glados.ASR import get_audio_transcriber
    GLADOS_ASR_AVAILABLE = True
    logger.info("GLaDOS ASR module found")
except ImportError:
    GLADOS_ASR_AVAILABLE = False
    logger.warning("GLaDOS ASR not available")


# Initialize GLaDOS ASR if available (fallback)
# Model is loaded eagerly at import time; parakeet_asr stays None on failure
# and every caller checks for that before use.
parakeet_asr = None
if GLADOS_ASR_AVAILABLE:
    try:
        logger.info("Loading GLaDOS Parakeet ASR model...")
        parakeet_asr = get_audio_transcriber(engine_type="tdt")
        logger.info("Parakeet ASR loaded")
    except Exception as e:
        logger.error(f"Failed to load Parakeet ASR: {e}")
|
||||
|
||||
|
||||
class WyomingWhisper:
    """Speech-to-text client for a Wyoming-protocol Whisper server."""

    def __init__(self, host="localhost", port=10300):
        # Server location; defaults match the standard Wyoming Whisper port.
        self.host = host
        self.port = port

    async def transcribe(self, audio_bytes):
        """Stream 16 kHz mono 16-bit PCM to the server and return its transcript.

        Returns the transcript text, or None when the Wyoming library is
        unavailable, the server closes without a transcript, or any error
        occurs (errors are logged, never raised).
        """
        if not WYOMING_AVAILABLE:
            return None

        # Fixed stream parameters: 16 kHz sample rate, 2-byte samples, mono.
        rate, width, channels = 16000, 2, 1
        chunk_size = 4096

        try:
            async with AsyncTcpClient(self.host, self.port) as client:
                await client.write_event(Transcribe().event())

                await client.write_event(
                    AudioStart(rate=rate, width=width, channels=channels).event()
                )

                # Ship the PCM payload in fixed-size chunks.
                offset = 0
                total = len(audio_bytes)
                while offset < total:
                    piece = audio_bytes[offset:offset + chunk_size]
                    await client.write_event(
                        AudioChunk(
                            audio=piece, rate=rate, width=width, channels=channels
                        ).event()
                    )
                    offset += chunk_size

                await client.write_event(AudioStop().event())

                # Drain events until a transcript arrives or the server hangs up.
                while (event := await client.read_event()) is not None:
                    if Transcript.is_type(event.type):
                        return Transcript.from_event(event).text
        except Exception as e:
            logger.error(f"Wyoming Whisper error: {e}")
        return None
|
||||
|
||||
|
||||
class ParakeetASR:
    """Speech-to-text using GLaDOS Parakeet ASR (fallback)."""

    async def transcribe(self, audio_bytes):
        """Transcribe raw 48 kHz mono 16-bit PCM with the Parakeet model.

        Returns the stripped transcript text, or None when the model is
        unavailable, no text is produced, or any error occurs.
        """
        if not parakeet_asr:
            return None
        try:
            audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
            # Cap input at 30 seconds of 48 kHz samples to bound latency.
            if len(audio_np) > 48000 * 30:
                audio_np = audio_np[:48000 * 30]
            # Naive decimation 48 kHz -> 16 kHz (every 3rd sample; no
            # anti-alias filter — acceptable for speech intelligibility).
            ratio = 48000 // 16000
            audio_16k = audio_np[::ratio].astype(np.int16)
            # Fix: normalize int16 to [-1.0, 1.0] float32. The other call
            # site that feeds this same model
            # (VoiceBot.convert_discord_audio_to_parakeet) divides by 32768;
            # previously this path passed raw int16 magnitudes, so the model
            # saw wildly different scales depending on the caller.
            audio_float = audio_16k.astype(np.float32) / 32768.0
            text = parakeet_asr.transcribe(audio_float)
            return text.strip() if text else None
        except Exception as e:
            logger.error(f"Parakeet ASR error: {e}")
            return None
|
||||
|
||||
|
||||
class HTTPTTS:
    """Text-to-speech client for an OpenAI-style HTTP speech endpoint."""

    def __init__(self, base_url, voice="glados"):
        self.base_url = base_url
        self.voice = voice

    async def synthesize(self, text):
        """Synthesize *text* and return the encoded audio bytes.

        Returns the response body on HTTP 200/201, otherwise None. Errors are
        logged, never raised.

        Fix: the blocking requests.post call is now run in a thread-pool
        executor — previously it ran directly inside this coroutine and froze
        the event loop (stalling Discord heartbeats) for up to 30 seconds.
        """
        try:
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None,
                lambda: requests.post(
                    f"{self.base_url}/v1/audio/speech",
                    json={"input": text, "voice": self.voice},
                    timeout=30
                )
            )
            if response.status_code in [200, 201]:
                logger.info(f"Got TTS audio: {len(response.content)} bytes")
                return response.content
        except Exception as e:
            logger.error(f"TTS error: {e}")
        # Non-2xx status or any failure above falls through to None.
        return None
|
||||
|
||||
|
||||
class OllamaClient:
    """Minimal blocking client for Ollama's /api/generate endpoint."""

    def __init__(self, base_url, model):
        # e.g. base_url="http://localhost:11434", model="llama3"
        self.base_url = base_url
        self.model = model

    def generate(self, user_message):
        """Return the model's non-streaming reply to *user_message*.

        On any failure (network error, non-2xx status, bad JSON) a canned
        apology string is returned instead of raising, so callers always have
        something to display or speak. NOTE: this call blocks; run it in an
        executor from async code.
        """
        try:
            url = f"{self.base_url}/api/generate"
            payload = {
                "model": self.model,
                "prompt": f"Keep responses concise and conversational. User: {user_message}",
                "stream": False
            }
            response = requests.post(url, json=payload, timeout=30)
            # Fix: surface HTTP errors instead of parsing an error body as if
            # it were a normal completion (which silently yielded an empty
            # reply). raise_for_status routes them to the except below.
            response.raise_for_status()
            result = response.json()
            return result.get('response', '').strip()
        except Exception as e:
            logger.error(f"Ollama error: {e}")
            return "I'm sorry, I couldn't process that."
|
||||
|
||||
|
||||
# Load config from config.yaml sitting next to this script.
# yaml.safe_load refuses arbitrary object tags, so a hostile config file
# cannot execute code.
config_path = os.path.join(os.path.dirname(__file__), 'config.yaml')
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Components — module-level singletons shared by the bot commands below.
# whisper_stt is None when the wyoming library could not be imported.
whisper_stt = WyomingWhisper(config['whisper']['host'], config['whisper']['port']) if WYOMING_AVAILABLE else None
parakeet_stt = ParakeetASR()
http_tts = HTTPTTS(config['tts']['http_url'], config['tts'].get('voice', 'glados'))
ollama = OllamaClient(config['ollama']['base_url'], config['ollama']['model'])
|
||||
|
||||
|
||||
class VoiceBot(commands.Bot):
    """Discord voice bot WITHOUT sinks dependency.

    Captures raw voice packets manually, transcribes them with the Parakeet
    ASR, asks Ollama for a reply, and speaks the reply via HTTP TTS.
    """

    def __init__(self, *args, **kwargs):
        intents = discord.Intents.default()
        intents.message_content = True   # required to read command text
        intents.voice_states = True      # required to find callers' channels
        super().__init__(command_prefix="!", intents=intents, *args, **kwargs)
        self.voice_client = None          # active voice connection, if any
        self.config = config
        self._recording = False           # True while record_audio() runs
        self._audio_buffer = bytearray()  # raw PCM gathered during capture

    async def on_ready(self):
        """Log identity and usage hints once the gateway session is up."""
        logger.info(f"Bot ready! {self.user.name} ({self.user.id})")
        logger.info("Use !join to connect to voice channel, !leave to disconnect")

    async def on_message(self, message):
        """Dispatch commands, ignoring the bot's own messages."""
        if message.author == self.user:
            return
        await self.process_commands(message)

    async def join_voice_channel(self, channel):
        """Connect to *channel*, dropping any existing voice connection first."""
        if self.voice_client:
            await self.voice_client.disconnect()
        self.voice_client = await channel.connect()
        logger.info(f"Joined voice channel: {channel.name}")

    def convert_discord_audio_to_parakeet(self, audio_bytes):
        """Convert Discord 48kHz stereo PCM to 16kHz mono float32 for Parakeet.

        Returns a float32 array normalized to [-1.0, 1.0], or None on failure.
        """
        try:
            # Discord audio is 48kHz, stereo, 16-bit PCM
            # Convert bytes to int16 numpy array
            audio_np = np.frombuffer(audio_bytes, dtype=np.int16)

            # Stereo to mono: average left and right channels
            audio_np = audio_np.reshape(-1, 2).mean(axis=1).astype(np.int16)

            # Resample 48kHz to 16kHz (take every 3rd sample).
            # NOTE: no anti-alias filter — fine for speech intelligibility.
            audio_16k = audio_np[::3]

            # Convert int16 to float32 (normalize to [-1.0, 1.0])
            audio_float = audio_16k.astype(np.float32) / 32768.0

            return audio_float
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return None

    async def record_audio(self, duration=5):
        """Record audio from the voice channel for *duration* seconds.

        Returns the raw captured bytes (possibly empty), or None when not
        connected to a voice channel.
        """
        if not self.voice_client:
            logger.warning("Not in voice channel")
            return None

        self._recording = True
        self._audio_buffer = bytearray()

        logger.info(f"Recording for {duration} seconds...")
        start_time = asyncio.get_event_loop().time()

        while self._recording and (asyncio.get_event_loop().time() - start_time) < duration:
            try:
                # Poll for the next audio packet; the short timeout keeps the
                # duration check responsive.
                # NOTE(review): VoiceClient.receive() is not part of stock
                # discord.py — confirm the installed fork/version provides it.
                packet = await asyncio.wait_for(
                    self.voice_client.receive(),
                    timeout=0.1
                )
                if packet and hasattr(packet, 'data'):
                    self._audio_buffer.extend(packet.data)
            except asyncio.TimeoutError:
                continue
            except Exception as e:
                # Transient receive errors are expected; keep capturing.
                logger.debug(f"Recv error: {e}")
                continue

        self._recording = False
        audio_data = bytes(self._audio_buffer)
        logger.info(f"Recorded {len(audio_data)} bytes")
        return audio_data

    async def process_voice_command(self, ctx):
        """Record, transcribe, get LLM response, and speak.

        Reports each stage's progress and latency back to *ctx*.
        """
        await ctx.send("🎙️ Listening... (speak now)")

        # Record audio
        start_time = asyncio.get_event_loop().time()
        audio_bytes = await self.record_audio(duration=5)
        record_time = asyncio.get_event_loop().time() - start_time

        # Under ~1000 bytes is too little PCM to contain speech.
        if not audio_bytes or len(audio_bytes) < 1000:
            await ctx.send("❌ No audio captured (too quiet or not in voice channel)")
            return

        await ctx.send(f"📝 Transcribing ({len(audio_bytes)} bytes, {record_time:.1f}s)...")

        # Convert audio format
        audio_float = self.convert_discord_audio_to_parakeet(audio_bytes)
        if audio_float is None:
            await ctx.send("❌ Audio conversion failed")
            return

        # Transcribe with Parakeet
        transcribe_start = asyncio.get_event_loop().time()
        loop = asyncio.get_event_loop()
        try:
            # Run transcription in thread pool (it's CPU intensive)
            text = await loop.run_in_executor(
                None,
                lambda: parakeet_asr.transcribe(audio_float)
            )
            transcribe_time = asyncio.get_event_loop().time() - transcribe_start
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            await ctx.send(f"❌ Transcription failed: {e}")
            return

        if not text or not text.strip():
            await ctx.send("❌ No speech detected")
            return

        await ctx.send(f"👤 You said: \"{text}\" ({transcribe_time:.1f}s)")

        # Get LLM response.
        # Fix: ollama.generate is a blocking HTTP call (up to 30 s); run it in
        # the executor instead of freezing the event loop like before.
        llm_start = asyncio.get_event_loop().time()
        response = await loop.run_in_executor(None, lambda: ollama.generate(text))
        llm_time = asyncio.get_event_loop().time() - llm_start

        if not response:
            await ctx.send("❌ LLM failed to respond")
            return

        await ctx.send(f"🤖 GLaDOS: \"{response}\" ({llm_time:.1f}s)")

        # Synthesize and speak
        tts_start = asyncio.get_event_loop().time()
        audio = await http_tts.synthesize(response)
        tts_time = asyncio.get_event_loop().time() - tts_start

        if audio:
            await self.play_audio(audio)
            total_time = record_time + transcribe_time + llm_time + tts_time
            await ctx.send(f"⏱️ Total latency: {total_time:.1f}s (rec: {record_time:.1f}, stt: {transcribe_time:.1f}, llm: {llm_time:.1f}, tts: {tts_time:.1f})")
        else:
            await ctx.send("❌ TTS failed")

    async def play_audio(self, audio_bytes):
        """Play encoded audio in the voice channel; return True on success."""
        if not self.voice_client:
            logger.warning("Not connected to voice channel")
            return False

        # Sniff the container from its magic bytes so FFmpeg gets a matching
        # file extension: RIFF header means WAV, otherwise assume MP3.
        if audio_bytes[:4] == b'RIFF':
            suffix = '.wav'
        else:
            suffix = '.mp3'

        # Create a temp file for FFmpeg
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp:
            temp.write(audio_bytes)
            temp_path = temp.name

        try:
            source = discord.FFmpegPCMAudio(temp_path)
            if self.voice_client.is_playing():
                self.voice_client.stop()
            self.voice_client.play(source)

            # Wait for playback to finish
            while self.voice_client.is_playing():
                await asyncio.sleep(0.1)
            return True
        except Exception as e:
            logger.error(f"Error playing audio: {e}")
            return False
        finally:
            # Best-effort cleanup of the temp file.
            # Fix: catch only OSError rather than a bare except that also
            # swallowed KeyboardInterrupt/SystemExit.
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
||||
|
||||
|
||||
# Single module-level bot instance used by the command decorators below.
bot = VoiceBot()
|
||||
|
||||
|
||||
@bot.command(name='leave')
async def leave(ctx):
    """Disconnect from the current voice channel and confirm in chat."""
    vc = bot.voice_client
    if vc:
        await vc.disconnect()
        bot.voice_client = None
    # Confirmation is sent whether or not the bot was connected.
    await ctx.send("Left voice channel.")
|
||||
|
||||
|
||||
@bot.command(name='join')
async def join(ctx):
    """Connect the bot to the command author's voice channel."""
    # Guard: the author must already be sitting in a voice channel.
    if not ctx.author.voice:
        await ctx.send("You need to be in a voice channel!")
        return

    channel = ctx.author.voice.channel
    await bot.join_voice_channel(channel)
    await ctx.send(f"Joined {channel.name}!")
|
||||
|
||||
|
||||
@bot.command(name='test')
async def test(ctx, *, text="Hello! This is a test."):
    """Synthesize *text* via TTS and play it in the current voice channel."""
    if not bot.voice_client:
        await ctx.send("Not in voice channel! Use !join first.")
        return

    await ctx.send(f"🎙️ Saying: {text}")

    audio = await http_tts.synthesize(text)
    if not audio:
        await ctx.send("TTS error.")
        return

    played = await bot.play_audio(audio)
    if not played:
        await ctx.send("Failed to play audio.")
|
||||
|
||||
|
||||
@bot.command(name='say')
async def say(ctx, *, text):
    """Say text using TTS."""
    # Alias for !test. NOTE(review): after decoration `test` is a Command
    # object, so this relies on Command.__call__ forwarding to the callback —
    # confirm the installed discord library version supports that.
    await test(ctx, text=text)
|
||||
|
||||
|
||||
@bot.command(name='listen')
async def listen(ctx):
    """Capture 5 seconds of voice, transcribe it, and answer via TTS."""
    # Both a voice connection and a loaded ASR model are prerequisites.
    if not bot.voice_client:
        await ctx.send("Not in voice channel! Use !join first.")
        return

    if not parakeet_asr:
        await ctx.send("❌ Parakeet ASR not available. Check GLaDOS installation.")
        return

    await bot.process_voice_command(ctx)
|
||||
|
||||
|
||||
@bot.command(name='ask')
async def ask(ctx, *, question):
    """Send *question* to the LLM and reply in text (and voice if connected)."""
    await ctx.send("🤔 Thinking...")

    response = ollama.generate(question)
    if not response:
        await ctx.send("Failed to get response.")
        return

    await ctx.send(f"💬 {response}")

    # Also speak it if in voice channel
    if bot.voice_client:
        audio = await http_tts.synthesize(response)
        if audio:
            await bot.play_audio(audio)
|
||||
|
||||
|
||||
async def main():
    """Entry point: validate the Discord token and run the bot until it stops."""
    # Fix: a config.yaml missing the 'discord' section or 'token' key used to
    # crash with a bare KeyError before logging anything useful; treat a
    # missing token the same as the unconfigured placeholder.
    token = (config.get('discord') or {}).get('token', '')
    if not token or token.startswith("YOUR_"):
        logger.error("Configure Discord token in config.yaml!")
        return

    logger.info("Starting Discord bot...")
    await bot.start(token)
|
||||
|
||||
|
||||
# Script entry point: run the async main() on a fresh event loop.
if __name__ == '__main__':
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user