Fresh start - excluded large ROM JSON files
This commit is contained in:
443
docker/discord-voice-bot/main.py
Normal file
443
docker/discord-voice-bot/main.py
Normal file
@@ -0,0 +1,443 @@
|
||||
"""
|
||||
Discord Voice Bot - Simple GLaDOS Voice Version
|
||||
Uses Wyoming Whisper for STT, Ollama for LLM, HTTP TTS for GLaDOS voice.
|
||||
Works WITHOUT discord.sinks (manual audio capture)
|
||||
"""
|
||||
|
||||
import logging
|
||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import wave
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import numpy as np
|
||||
import requests
|
||||
import yaml
|
||||
import discord
|
||||
from discord.ext import commands
|
||||
import json
|
||||
|
||||
# Import Wyoming protocol
# Optional dependency: when missing, WYOMING_AVAILABLE gates all Wyoming use.
try:
    from wyoming.client import AsyncTcpClient
    from wyoming.audio import AudioChunk, AudioStart, AudioStop
    from wyoming.asr import Transcribe, Transcript
    WYOMING_AVAILABLE = True
except ImportError:
    logger.warning("Wyoming library not available")
    WYOMING_AVAILABLE = False

# Optional: Import GLaDOS ASR (Windows path)
# NOTE(review): hard-coded Windows path makes this import host-specific;
# inside the Docker image this likely always falls through to ImportError.
sys.path.insert(0, r'C:\glados\src')
try:
    from glados.ASR import get_audio_transcriber
    GLADOS_ASR_AVAILABLE = True
    logger.info("GLaDOS ASR module found")
except ImportError:
    GLADOS_ASR_AVAILABLE = False
    logger.warning("GLaDOS ASR not available")


# Initialize GLaDOS ASR if available (fallback)
# Model is loaded eagerly at import time; parakeet_asr stays None on failure
# and every caller checks for that before use.
parakeet_asr = None
if GLADOS_ASR_AVAILABLE:
    try:
        logger.info("Loading GLaDOS Parakeet ASR model...")
        parakeet_asr = get_audio_transcriber(engine_type="tdt")
        logger.info("Parakeet ASR loaded")
    except Exception as e:
        logger.error(f"Failed to load Parakeet ASR: {e}")
|
||||
|
||||
|
||||
class WyomingWhisper:
    """Speech-to-text client for a Wyoming-protocol Whisper server."""

    def __init__(self, host="localhost", port=10300):
        # Server location; defaults match the standard Wyoming Whisper port.
        self.host = host
        self.port = port

    async def transcribe(self, audio_bytes):
        """Stream 16 kHz mono 16-bit PCM to the server and return its transcript.

        Returns the transcript text, or None when the Wyoming library is
        unavailable, the server closes without a transcript, or any error
        occurs (errors are logged, never raised).
        """
        if not WYOMING_AVAILABLE:
            return None

        # Fixed stream parameters: 16 kHz sample rate, 2-byte samples, mono.
        rate, width, channels = 16000, 2, 1
        chunk_size = 4096

        try:
            async with AsyncTcpClient(self.host, self.port) as client:
                await client.write_event(Transcribe().event())

                await client.write_event(
                    AudioStart(rate=rate, width=width, channels=channels).event()
                )

                # Ship the PCM payload in fixed-size chunks.
                offset = 0
                total = len(audio_bytes)
                while offset < total:
                    piece = audio_bytes[offset:offset + chunk_size]
                    await client.write_event(
                        AudioChunk(
                            audio=piece, rate=rate, width=width, channels=channels
                        ).event()
                    )
                    offset += chunk_size

                await client.write_event(AudioStop().event())

                # Drain events until a transcript arrives or the server hangs up.
                while (event := await client.read_event()) is not None:
                    if Transcript.is_type(event.type):
                        return Transcript.from_event(event).text
        except Exception as e:
            logger.error(f"Wyoming Whisper error: {e}")
        return None
|
||||
|
||||
|
||||
class ParakeetASR:
    """Speech-to-text using GLaDOS Parakeet ASR (fallback)."""

    async def transcribe(self, audio_bytes):
        """Transcribe raw 48 kHz mono 16-bit PCM with the Parakeet model.

        Returns the stripped transcript text, or None when the model is
        unavailable, no text is produced, or any error occurs.
        """
        if not parakeet_asr:
            return None
        try:
            audio_np = np.frombuffer(audio_bytes, dtype=np.int16)
            # Cap input at 30 seconds of 48 kHz samples to bound latency.
            if len(audio_np) > 48000 * 30:
                audio_np = audio_np[:48000 * 30]
            # Naive decimation 48 kHz -> 16 kHz (every 3rd sample; no
            # anti-alias filter — acceptable for speech intelligibility).
            ratio = 48000 // 16000
            audio_16k = audio_np[::ratio].astype(np.int16)
            # Fix: normalize int16 to [-1.0, 1.0] float32. The other call
            # site that feeds this same model
            # (VoiceBot.convert_discord_audio_to_parakeet) divides by 32768;
            # previously this path passed raw int16 magnitudes, so the model
            # saw wildly different scales depending on the caller.
            audio_float = audio_16k.astype(np.float32) / 32768.0
            text = parakeet_asr.transcribe(audio_float)
            return text.strip() if text else None
        except Exception as e:
            logger.error(f"Parakeet ASR error: {e}")
            return None
|
||||
|
||||
|
||||
class HTTPTTS:
    """Text-to-speech client for an OpenAI-style HTTP speech endpoint."""

    def __init__(self, base_url, voice="glados"):
        self.base_url = base_url
        self.voice = voice

    async def synthesize(self, text):
        """Synthesize *text* and return the encoded audio bytes.

        Returns the response body on HTTP 200/201, otherwise None. Errors are
        logged, never raised.

        Fix: the blocking requests.post call is now run in a thread-pool
        executor — previously it ran directly inside this coroutine and froze
        the event loop (stalling Discord heartbeats) for up to 30 seconds.
        """
        try:
            loop = asyncio.get_event_loop()
            response = await loop.run_in_executor(
                None,
                lambda: requests.post(
                    f"{self.base_url}/v1/audio/speech",
                    json={"input": text, "voice": self.voice},
                    timeout=30
                )
            )
            if response.status_code in [200, 201]:
                logger.info(f"Got TTS audio: {len(response.content)} bytes")
                return response.content
        except Exception as e:
            logger.error(f"TTS error: {e}")
        # Non-2xx status or any failure above falls through to None.
        return None
|
||||
|
||||
|
||||
class OllamaClient:
    """Minimal blocking client for Ollama's /api/generate endpoint."""

    def __init__(self, base_url, model):
        # e.g. base_url="http://localhost:11434", model="llama3"
        self.base_url = base_url
        self.model = model

    def generate(self, user_message):
        """Return the model's non-streaming reply to *user_message*.

        On any failure (network error, non-2xx status, bad JSON) a canned
        apology string is returned instead of raising, so callers always have
        something to display or speak. NOTE: this call blocks; run it in an
        executor from async code.
        """
        try:
            url = f"{self.base_url}/api/generate"
            payload = {
                "model": self.model,
                "prompt": f"Keep responses concise and conversational. User: {user_message}",
                "stream": False
            }
            response = requests.post(url, json=payload, timeout=30)
            # Fix: surface HTTP errors instead of parsing an error body as if
            # it were a normal completion (which silently yielded an empty
            # reply). raise_for_status routes them to the except below.
            response.raise_for_status()
            result = response.json()
            return result.get('response', '').strip()
        except Exception as e:
            logger.error(f"Ollama error: {e}")
            return "I'm sorry, I couldn't process that."
|
||||
|
||||
|
||||
# Load config from config.yaml sitting next to this script.
# yaml.safe_load refuses arbitrary object tags, so a hostile config file
# cannot execute code.
config_path = os.path.join(os.path.dirname(__file__), 'config.yaml')
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Components — module-level singletons shared by the bot commands below.
# whisper_stt is None when the wyoming library could not be imported.
whisper_stt = WyomingWhisper(config['whisper']['host'], config['whisper']['port']) if WYOMING_AVAILABLE else None
parakeet_stt = ParakeetASR()
http_tts = HTTPTTS(config['tts']['http_url'], config['tts'].get('voice', 'glados'))
ollama = OllamaClient(config['ollama']['base_url'], config['ollama']['model'])
|
||||
|
||||
|
||||
class VoiceBot(commands.Bot):
    """Discord voice bot WITHOUT sinks dependency.

    Captures raw voice packets manually, transcribes them with the Parakeet
    ASR, asks Ollama for a reply, and speaks the reply via HTTP TTS.
    """

    def __init__(self, *args, **kwargs):
        intents = discord.Intents.default()
        intents.message_content = True   # required to read command text
        intents.voice_states = True      # required to find callers' channels
        super().__init__(command_prefix="!", intents=intents, *args, **kwargs)
        self.voice_client = None          # active voice connection, if any
        self.config = config
        self._recording = False           # True while record_audio() runs
        self._audio_buffer = bytearray()  # raw PCM gathered during capture

    async def on_ready(self):
        """Log identity and usage hints once the gateway session is up."""
        logger.info(f"Bot ready! {self.user.name} ({self.user.id})")
        logger.info("Use !join to connect to voice channel, !leave to disconnect")

    async def on_message(self, message):
        """Dispatch commands, ignoring the bot's own messages."""
        if message.author == self.user:
            return
        await self.process_commands(message)

    async def join_voice_channel(self, channel):
        """Connect to *channel*, dropping any existing voice connection first."""
        if self.voice_client:
            await self.voice_client.disconnect()
        self.voice_client = await channel.connect()
        logger.info(f"Joined voice channel: {channel.name}")

    def convert_discord_audio_to_parakeet(self, audio_bytes):
        """Convert Discord 48kHz stereo PCM to 16kHz mono float32 for Parakeet.

        Returns a float32 array normalized to [-1.0, 1.0], or None on failure.
        """
        try:
            # Discord audio is 48kHz, stereo, 16-bit PCM
            # Convert bytes to int16 numpy array
            audio_np = np.frombuffer(audio_bytes, dtype=np.int16)

            # Stereo to mono: average left and right channels
            audio_np = audio_np.reshape(-1, 2).mean(axis=1).astype(np.int16)

            # Resample 48kHz to 16kHz (take every 3rd sample).
            # NOTE: no anti-alias filter — fine for speech intelligibility.
            audio_16k = audio_np[::3]

            # Convert int16 to float32 (normalize to [-1.0, 1.0])
            audio_float = audio_16k.astype(np.float32) / 32768.0

            return audio_float
        except Exception as e:
            logger.error(f"Audio conversion error: {e}")
            return None

    async def record_audio(self, duration=5):
        """Record audio from the voice channel for *duration* seconds.

        Returns the raw captured bytes (possibly empty), or None when not
        connected to a voice channel.
        """
        if not self.voice_client:
            logger.warning("Not in voice channel")
            return None

        self._recording = True
        self._audio_buffer = bytearray()

        logger.info(f"Recording for {duration} seconds...")
        start_time = asyncio.get_event_loop().time()

        while self._recording and (asyncio.get_event_loop().time() - start_time) < duration:
            try:
                # Poll for the next audio packet; the short timeout keeps the
                # duration check responsive.
                # NOTE(review): VoiceClient.receive() is not part of stock
                # discord.py — confirm the installed fork/version provides it.
                packet = await asyncio.wait_for(
                    self.voice_client.receive(),
                    timeout=0.1
                )
                if packet and hasattr(packet, 'data'):
                    self._audio_buffer.extend(packet.data)
            except asyncio.TimeoutError:
                continue
            except Exception as e:
                # Transient receive errors are expected; keep capturing.
                logger.debug(f"Recv error: {e}")
                continue

        self._recording = False
        audio_data = bytes(self._audio_buffer)
        logger.info(f"Recorded {len(audio_data)} bytes")
        return audio_data

    async def process_voice_command(self, ctx):
        """Record, transcribe, get LLM response, and speak.

        Reports each stage's progress and latency back to *ctx*.
        """
        await ctx.send("🎙️ Listening... (speak now)")

        # Record audio
        start_time = asyncio.get_event_loop().time()
        audio_bytes = await self.record_audio(duration=5)
        record_time = asyncio.get_event_loop().time() - start_time

        # Under ~1000 bytes is too little PCM to contain speech.
        if not audio_bytes or len(audio_bytes) < 1000:
            await ctx.send("❌ No audio captured (too quiet or not in voice channel)")
            return

        await ctx.send(f"📝 Transcribing ({len(audio_bytes)} bytes, {record_time:.1f}s)...")

        # Convert audio format
        audio_float = self.convert_discord_audio_to_parakeet(audio_bytes)
        if audio_float is None:
            await ctx.send("❌ Audio conversion failed")
            return

        # Transcribe with Parakeet
        transcribe_start = asyncio.get_event_loop().time()
        loop = asyncio.get_event_loop()
        try:
            # Run transcription in thread pool (it's CPU intensive)
            text = await loop.run_in_executor(
                None,
                lambda: parakeet_asr.transcribe(audio_float)
            )
            transcribe_time = asyncio.get_event_loop().time() - transcribe_start
        except Exception as e:
            logger.error(f"Transcription error: {e}")
            await ctx.send(f"❌ Transcription failed: {e}")
            return

        if not text or not text.strip():
            await ctx.send("❌ No speech detected")
            return

        await ctx.send(f"👤 You said: \"{text}\" ({transcribe_time:.1f}s)")

        # Get LLM response.
        # Fix: ollama.generate is a blocking HTTP call (up to 30 s); run it in
        # the executor instead of freezing the event loop like before.
        llm_start = asyncio.get_event_loop().time()
        response = await loop.run_in_executor(None, lambda: ollama.generate(text))
        llm_time = asyncio.get_event_loop().time() - llm_start

        if not response:
            await ctx.send("❌ LLM failed to respond")
            return

        await ctx.send(f"🤖 GLaDOS: \"{response}\" ({llm_time:.1f}s)")

        # Synthesize and speak
        tts_start = asyncio.get_event_loop().time()
        audio = await http_tts.synthesize(response)
        tts_time = asyncio.get_event_loop().time() - tts_start

        if audio:
            await self.play_audio(audio)
            total_time = record_time + transcribe_time + llm_time + tts_time
            await ctx.send(f"⏱️ Total latency: {total_time:.1f}s (rec: {record_time:.1f}, stt: {transcribe_time:.1f}, llm: {llm_time:.1f}, tts: {tts_time:.1f})")
        else:
            await ctx.send("❌ TTS failed")

    async def play_audio(self, audio_bytes):
        """Play encoded audio in the voice channel; return True on success."""
        if not self.voice_client:
            logger.warning("Not connected to voice channel")
            return False

        # Sniff the container from its magic bytes so FFmpeg gets a matching
        # file extension: RIFF header means WAV, otherwise assume MP3.
        if audio_bytes[:4] == b'RIFF':
            suffix = '.wav'
        else:
            suffix = '.mp3'

        # Create a temp file for FFmpeg
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp:
            temp.write(audio_bytes)
            temp_path = temp.name

        try:
            source = discord.FFmpegPCMAudio(temp_path)
            if self.voice_client.is_playing():
                self.voice_client.stop()
            self.voice_client.play(source)

            # Wait for playback to finish
            while self.voice_client.is_playing():
                await asyncio.sleep(0.1)
            return True
        except Exception as e:
            logger.error(f"Error playing audio: {e}")
            return False
        finally:
            # Best-effort cleanup of the temp file.
            # Fix: catch only OSError rather than a bare except that also
            # swallowed KeyboardInterrupt/SystemExit.
            try:
                os.unlink(temp_path)
            except OSError:
                pass
|
||||
|
||||
|
||||
# Single module-level bot instance used by the command decorators below.
bot = VoiceBot()
|
||||
|
||||
|
||||
@bot.command(name='leave')
async def leave(ctx):
    """Disconnect from the current voice channel and confirm in chat."""
    vc = bot.voice_client
    if vc:
        await vc.disconnect()
        bot.voice_client = None
    # Confirmation is sent whether or not the bot was connected.
    await ctx.send("Left voice channel.")
|
||||
|
||||
|
||||
@bot.command(name='join')
async def join(ctx):
    """Connect the bot to the command author's voice channel."""
    # Guard: the author must already be sitting in a voice channel.
    if not ctx.author.voice:
        await ctx.send("You need to be in a voice channel!")
        return

    channel = ctx.author.voice.channel
    await bot.join_voice_channel(channel)
    await ctx.send(f"Joined {channel.name}!")
|
||||
|
||||
|
||||
@bot.command(name='test')
async def test(ctx, *, text="Hello! This is a test."):
    """Synthesize *text* via TTS and play it in the current voice channel."""
    if not bot.voice_client:
        await ctx.send("Not in voice channel! Use !join first.")
        return

    await ctx.send(f"🎙️ Saying: {text}")

    audio = await http_tts.synthesize(text)
    if not audio:
        await ctx.send("TTS error.")
        return

    played = await bot.play_audio(audio)
    if not played:
        await ctx.send("Failed to play audio.")
|
||||
|
||||
|
||||
@bot.command(name='say')
async def say(ctx, *, text):
    """Say text using TTS."""
    # Alias for !test. NOTE(review): after decoration `test` is a Command
    # object, so this relies on Command.__call__ forwarding to the callback —
    # confirm the installed discord library version supports that.
    await test(ctx, text=text)
|
||||
|
||||
|
||||
@bot.command(name='listen')
async def listen(ctx):
    """Capture 5 seconds of voice, transcribe it, and answer via TTS."""
    # Both a voice connection and a loaded ASR model are prerequisites.
    if not bot.voice_client:
        await ctx.send("Not in voice channel! Use !join first.")
        return

    if not parakeet_asr:
        await ctx.send("❌ Parakeet ASR not available. Check GLaDOS installation.")
        return

    await bot.process_voice_command(ctx)
|
||||
|
||||
|
||||
@bot.command(name='ask')
async def ask(ctx, *, question):
    """Send *question* to the LLM and reply in text (and voice if connected)."""
    await ctx.send("🤔 Thinking...")

    response = ollama.generate(question)
    if not response:
        await ctx.send("Failed to get response.")
        return

    await ctx.send(f"💬 {response}")

    # Also speak it if in voice channel
    if bot.voice_client:
        audio = await http_tts.synthesize(response)
        if audio:
            await bot.play_audio(audio)
|
||||
|
||||
|
||||
async def main():
    """Entry point: validate the Discord token and run the bot until it stops."""
    # Fix: a config.yaml missing the 'discord' section or 'token' key used to
    # crash with a bare KeyError before logging anything useful; treat a
    # missing token the same as the unconfigured placeholder.
    token = (config.get('discord') or {}).get('token', '')
    if not token or token.startswith("YOUR_"):
        logger.error("Configure Discord token in config.yaml!")
        return

    logger.info("Starting Discord bot...")
    await bot.start(token)
|
||||
|
||||
|
||||
# Script entry point: run the async main() on a fresh event loop.
if __name__ == '__main__':
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user