"""
Discord Voice Bot - Simple GLaDOS Voice Version

Uses Wyoming Whisper for STT, Ollama for LLM, HTTP TTS for GLaDOS voice.
Works WITHOUT discord.sinks (manual audio capture)
"""
import logging

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

import asyncio
import io
import os
import sys
import tempfile
import wave
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import requests
import yaml

import discord
from discord.ext import commands
import json

# Import Wyoming protocol (optional; bot falls back to Parakeet ASR without it).
try:
    from wyoming.client import AsyncTcpClient
    from wyoming.audio import AudioChunk, AudioStart, AudioStop
    from wyoming.asr import Transcribe, Transcript
    WYOMING_AVAILABLE = True
except ImportError:
    logger.warning("Wyoming library not available")
    WYOMING_AVAILABLE = False

# Optional: Import GLaDOS ASR (Windows path)
sys.path.insert(0, r'C:\glados\src')
try:
    from glados.ASR import get_audio_transcriber
    GLADOS_ASR_AVAILABLE = True
    logger.info("GLaDOS ASR module found")
except ImportError:
    GLADOS_ASR_AVAILABLE = False
    logger.warning("GLaDOS ASR not available")

# Initialize GLaDOS ASR if available (fallback STT engine).
parakeet_asr = None
if GLADOS_ASR_AVAILABLE:
    try:
        logger.info("Loading GLaDOS Parakeet ASR model...")
        parakeet_asr = get_audio_transcriber(engine_type="tdt")
        logger.info("Parakeet ASR loaded")
    except Exception as e:
        logger.error(f"Failed to load Parakeet ASR: {e}")


class WyomingWhisper:
    """Speech-to-text using Wyoming Whisper."""

    def __init__(self, host="localhost", port=10300):
        self.host = host
        self.port = port

    async def transcribe(self, audio_bytes):
        """Transcribe audio using Wyoming Whisper.

        Args:
            audio_bytes: raw PCM audio, 16 kHz / 16-bit / mono
                (the rate/width/channels advertised in AudioStart below).

        Returns:
            The transcript text, or None on error / missing Wyoming / no
            transcript event before the connection closed.
        """
        if not WYOMING_AVAILABLE:
            return None
        try:
            async with AsyncTcpClient(self.host, self.port) as client:
                await client.write_event(Transcribe().event())
                chunk_size = 4096
                rate = 16000
                width = 2      # bytes per sample (16-bit PCM)
                channels = 1   # mono
                await client.write_event(
                    AudioStart(rate=rate, width=width, channels=channels).event()
                )
                # Stream the audio to the server in fixed-size chunks.
                for i in range(0, len(audio_bytes), chunk_size):
                    chunk = audio_bytes[i:i + chunk_size]
                    await client.write_event(
                        AudioChunk(
                            audio=chunk, rate=rate, width=width, channels=channels
                        ).event()
                    )
                await client.write_event(AudioStop().event())
                # Read events until we get a transcript or the stream ends.
                while True:
                    event = await client.read_event()
                    if event is None:
                        break  # connection closed without a transcript
                    if Transcript.is_type(event.type):
                        transcript = Transcript.from_event(event)
                        return transcript.text
        except Exception as e:
            logger.error(f"Wyoming Whisper error: {e}")
        return None


class ParakeetASR:
    """Speech-to-text using GLaDOS Parakeet ASR (fallback)."""

    async def transcribe(self, audio_bytes):
        """Transcribe 48 kHz / 16-bit / mono PCM bytes.

        Returns the stripped transcript text, or None when the model is
        unavailable, the result is empty, or an error occurs.
        """
        if not parakeet_asr:
            return None
        try:
            audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
            # Cap input at 30 seconds of 48 kHz audio to bound work.
            if len(audio_np) > 48000 * 30:
                audio_np = audio_np[:48000 * 30]
            # Naive decimation 48 kHz -> 16 kHz (keep every 3rd sample).
            # NOTE(review): no anti-alias low-pass; usually tolerable for
            # speech, but a proper resampler would be better.
            ratio = 48000 // 16000
            audio_16k = audio_np[::ratio].astype(np.int16)
            # FIX: normalize int16 to [-1.0, 1.0], matching
            # VoiceBot.convert_discord_audio_to_parakeet(); previously the
            # raw int16 range was passed straight to the model, inconsistent
            # with the primary audio path.
            audio_float = audio_16k.astype(np.float32) / 32768.0
            text = parakeet_asr.transcribe(audio_float)
            return text.strip() if text else None
        except Exception as e:
            logger.error(f"Parakeet ASR error: {e}")
            return None


class HTTPTTS:
    """Text-to-speech using HTTP API."""

    def __init__(self, base_url, voice="glados"):
        self.base_url = base_url
        self.voice = voice

    async def synthesize(self, text):
        """Synthesize *text* to speech.

        Returns the encoded audio bytes (WAV or MP3, per the server), or
        None on error / non-2xx status.
        """
        def _post():
            return requests.post(
                f"{self.base_url}/v1/audio/speech",
                json={"input": text, "voice": self.voice},
                timeout=30,
            )

        try:
            # FIX: requests is blocking; run it in a worker thread so a slow
            # TTS server (up to the 30 s timeout) cannot stall the Discord
            # event loop (heartbeats, voice playback).
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(None, _post)
            if response.status_code in [200, 201]:
                logger.info(f"Got TTS audio: {len(response.content)} bytes")
                return response.content
        except Exception as e:
            logger.error(f"TTS error: {e}")
        return None


class OllamaClient:
    """Client for Ollama."""

    def __init__(self, base_url, model):
        self.base_url = base_url
        self.model = model

    def generate(self, user_message):
        """Return the model's reply to *user_message*.

        Blocking HTTP call; on any failure returns a canned apology string
        rather than raising.
        """
        try:
            url = f"{self.base_url}/api/generate"
            payload = {
                "model": self.model,
                "prompt": f"Keep responses concise and conversational. User: {user_message}",
                "stream": False,
            }
            response = requests.post(url, json=payload, timeout=30)
            result = response.json()
            return result.get('response', '').strip()
        except Exception as e:
            logger.error(f"Ollama error: {e}")
            return "I'm sorry, I couldn't process that."


# Load config
config_path = os.path.join(os.path.dirname(__file__), 'config.yaml')
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Components
whisper_stt = (
    WyomingWhisper(config['whisper']['host'], config['whisper']['port'])
    if WYOMING_AVAILABLE else None
)
parakeet_stt = ParakeetASR()
http_tts = HTTPTTS(config['tts']['http_url'], config['tts'].get('voice', 'glados'))
ollama = OllamaClient(config['ollama']['base_url'], config['ollama']['model'])


class VoiceBot(commands.Bot):
    """Discord voice bot WITHOUT sinks dependency."""

    def __init__(self, *args, **kwargs):
        intents = discord.Intents.default()
        intents.message_content = True
        intents.voice_states = True
        # FIX: positional *args must precede keyword arguments in the call.
        super().__init__(*args, command_prefix="!", intents=intents, **kwargs)
        self.voice_client = None        # active discord.VoiceClient, if any
        self.config = config
        self._recording = False         # True while record_audio() is running
        self._audio_buffer = bytearray()

    async def on_ready(self):
        logger.info(f"Bot ready! {self.user.name} ({self.user.id})")
        logger.info("Use !join to connect to voice channel, !leave to disconnect")

    async def on_message(self, message):
        # Ignore our own messages; forward everything else to the
        # command processor.
        if message.author == self.user:
            return
        await self.process_commands(message)

    async def join_voice_channel(self, channel):
        """Connect to *channel*, dropping any existing voice connection."""
        if self.voice_client:
            await self.voice_client.disconnect()
        self.voice_client = await channel.connect()
        logger.info(f"Joined voice channel: {channel.name}")

    def convert_discord_audio_to_parakeet(self, audio_bytes):
        """Convert Discord 48kHz stereo PCM to 16kHz mono float32 for Parakeet."""
        try:
            # Discord audio is 48kHz, stereo, 16-bit PCM
            # Convert bytes to int16 numpy array
            audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
            # Stereo to mono: average left and right channels
            audio_np = audio_np.reshape(-1, 2).mean(axis=1).astype(np.int16)
            # Resample 48kHz to 16kHz (divide by 3)
            # NOTE(review): plain decimation, no anti-alias filter.
            audio_16k = audio_np[::3]
            # Convert int16 to float32 (normalize to [-1.0, 1.0])
            audio_float = audio_16k.astype(np.float32) / 32768.0
            return audio_float
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return None

    async def record_audio(self, duration=5):
        """Record audio from voice channel for specified duration.

        Returns the raw PCM bytes captured, or None if not connected.

        NOTE(review): relies on `self.voice_client.receive()`, which is not
        part of stock discord.py's VoiceClient API — confirm the installed
        fork/library actually provides it.
        """
        if not self.voice_client:
            logger.warning("Not in voice channel")
            return None
        self._recording = True
        self._audio_buffer = bytearray()
        logger.info(f"Recording for {duration} seconds...")
        start_time = asyncio.get_event_loop().time()
        while self._recording and (asyncio.get_event_loop().time() - start_time) < duration:
            try:
                # Try to get audio packet (non-blocking)
                packet = await asyncio.wait_for(
                    self.voice_client.receive(),
                    timeout=0.1,
                )
                if packet and hasattr(packet, 'data'):
                    self._audio_buffer.extend(packet.data)
            except asyncio.TimeoutError:
                continue
            except Exception as e:
                logger.debug(f"Recv error: {e}")
                continue
        self._recording = False
        audio_data = bytes(self._audio_buffer)
        logger.info(f"Recorded {len(audio_data)} bytes")
        return audio_data

    async def process_voice_command(self, ctx):
        """Record, transcribe, get LLM response, and speak."""
        await ctx.send("🎙️ Listening... (speak now)")

        # Record audio
        start_time = asyncio.get_event_loop().time()
        audio_bytes = await self.record_audio(duration=5)
        record_time = asyncio.get_event_loop().time() - start_time
        if not audio_bytes or len(audio_bytes) < 1000:
            await ctx.send("❌ No audio captured (too quiet or not in voice channel)")
            return

        await ctx.send(f"📝 Transcribing ({len(audio_bytes)} bytes, {record_time:.1f}s)...")

        # Convert audio format
        audio_float = self.convert_discord_audio_to_parakeet(audio_bytes)
        if audio_float is None:
            await ctx.send("❌ Audio conversion failed")
            return

        # Transcribe with Parakeet
        transcribe_start = asyncio.get_event_loop().time()
        try:
            # Run transcription in thread pool (it's CPU intensive)
            loop = asyncio.get_event_loop()
            text = await loop.run_in_executor(
                None,
                lambda: parakeet_asr.transcribe(audio_float),
            )
            transcribe_time = asyncio.get_event_loop().time() - transcribe_start
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            await ctx.send(f"❌ Transcription failed: {e}")
            return
        if not text or not text.strip():
            await ctx.send("❌ No speech detected")
            return

        await ctx.send(f"👤 You said: \"{text}\" ({transcribe_time:.1f}s)")

        # Get LLM response
        llm_start = asyncio.get_event_loop().time()
        response = ollama.generate(text)
        llm_time = asyncio.get_event_loop().time() - llm_start
        if not response:
            await ctx.send("❌ LLM failed to respond")
            return

        await ctx.send(f"🤖 GLaDOS: \"{response}\" ({llm_time:.1f}s)")

        # Synthesize and speak
        tts_start = asyncio.get_event_loop().time()
        audio = await http_tts.synthesize(response)
        tts_time = asyncio.get_event_loop().time() - tts_start
        if audio:
            await self.play_audio(audio)
            total_time = record_time + transcribe_time + llm_time + tts_time
            await ctx.send(
                f"⏱️ Total latency: {total_time:.1f}s (rec: {record_time:.1f}, stt: {transcribe_time:.1f}, llm: {llm_time:.1f}, tts: {tts_time:.1f})"
            )
        else:
            await ctx.send("❌ TTS failed")

    async def play_audio(self, audio_bytes):
        """Play audio in voice channel."""
        if not self.voice_client:
            logger.warning("Not connected to voice channel")
            return False
        # Sniff the container from the magic bytes so FFmpeg gets a
        # sensible file extension.
        if audio_bytes[:4] == b'RIFF':
            suffix = '.wav'
        else:
            suffix = '.mp3'
        # Create a temp file for FFmpeg
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp:
            temp.write(audio_bytes)
            temp_path = temp.name
        try:
            source = discord.FFmpegPCMAudio(temp_path)
            if self.voice_client.is_playing():
                self.voice_client.stop()
            self.voice_client.play(source)
            # Wait for playback to finish
            while self.voice_client.is_playing():
                await asyncio.sleep(0.1)
            return True
        except Exception as e:
            logger.error(f"Error playing audio: {e}")
            return False
        finally:
            # FIX: catch only OSError (file busy / already gone) instead of
            # a bare except that would also swallow KeyboardInterrupt etc.
            try:
                os.unlink(temp_path)
            except OSError:
                pass


bot = VoiceBot()


@bot.command(name='leave')
async def leave(ctx):
    """Leave voice channel."""
    if bot.voice_client:
        await bot.voice_client.disconnect()
        bot.voice_client = None
    await ctx.send("Left voice channel.")


@bot.command(name='join')
async def join(ctx):
    """Join voice channel."""
    if not ctx.author.voice:
        await ctx.send("You need to be in a voice channel!")
        return
    channel = ctx.author.voice.channel
    await bot.join_voice_channel(channel)
    await ctx.send(f"Joined {channel.name}!")


@bot.command(name='test')
async def test(ctx, *, text="Hello! This is a test."):
    """Test TTS."""
    if not bot.voice_client:
        await ctx.send("Not in voice channel! Use !join first.")
        return
    await ctx.send(f"🎙️ Saying: {text}")
    audio = await http_tts.synthesize(text)
    if audio:
        success = await bot.play_audio(audio)
        if not success:
            await ctx.send("Failed to play audio.")
    else:
        await ctx.send("TTS error.")


@bot.command(name='say')
async def say(ctx, *, text):
    """Say text using TTS."""
    # FIX: invoke the sibling command through the documented API instead of
    # calling the Command object directly (which relies on a
    # version-specific Command.__call__).
    await ctx.invoke(test, text=text)


@bot.command(name='listen')
async def listen(ctx):
    """Record voice for 5 seconds, transcribe, and respond."""
    if not bot.voice_client:
        await ctx.send("Not in voice channel! Use !join first.")
        return
    if not parakeet_asr:
        await ctx.send("❌ Parakeet ASR not available. Check GLaDOS installation.")
        return
    await bot.process_voice_command(ctx)


@bot.command(name='ask')
async def ask(ctx, *, question):
    """Ask the LLM something (text only, for now)."""
    await ctx.send("🤔 Thinking...")
    response = ollama.generate(question)
    if response:
        await ctx.send(f"💬 {response}")
        # Also speak it if in voice channel
        if bot.voice_client:
            audio = await http_tts.synthesize(response)
            if audio:
                await bot.play_audio(audio)
    else:
        await ctx.send("Failed to get response.")


async def main():
    """Validate the configured token and start the bot."""
    token = config['discord']['token']
    if token.startswith("YOUR_"):
        logger.error("Configure Discord token in config.yaml!")
        return
    logger.info("Starting Discord bot...")
    await bot.start(token)


if __name__ == '__main__':
    asyncio.run(main())