Fresh start - excluded large ROM JSON files

2026-04-11 09:45:12 -05:00
commit 5deb387aa6
395 changed files with 47744 additions and 0 deletions
--- a/tools/youtube-summarizer.py
+++ b/tools/youtube-summarizer.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""
+YouTube Video Summarizer
+Extracts transcripts and generates bullet summaries with timestamps
+"""
+
+import sys
+import re
+import json
+import subprocess
+import tempfile
+import os
+from urllib.parse import urlparse, parse_qs
+
+def extract_video_id(url):
+    """Return the 11-character YouTube video ID found in *url*, or None.
+
+    The URL shapes are tried in order: watch/short-link/embed, then a watch
+    URL whose ``v`` parameter is not first, then shorts. The first shape that
+    matches anywhere in the string supplies the ID.
+    """
+    url_shapes = (
+        r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
+        r'youtube\.com/watch\?.*v=([a-zA-Z0-9_-]{11})',
+        r'youtube\.com/shorts/([a-zA-Z0-9_-]{11})',
+    )
+    # Lazy generator: later shapes are only searched if earlier ones miss.
+    hits = (re.search(shape, url) for shape in url_shapes)
+    return next((hit.group(1) for hit in hits if hit), None)
+
+def get_transcript_yt_dlp(video_id):
+    """Download the English auto-generated subtitles for a video via yt-dlp.
+
+    Args:
+        video_id: YouTube video ID, interpolated into the watch URL.
+
+    Returns:
+        The subtitle file contents as a string; ``None`` when yt-dlp exited
+        successfully but no subtitle file was found; or a string starting
+        with ``"Error: "`` when yt-dlp could not be run, timed out, or exited
+        with a failure status without producing a subtitle file.
+    """
+    try:
+        # A private scratch directory is removed when the block exits, so
+        # repeated runs neither leak files into the shared temp directory
+        # nor pick up a stale transcript left by an earlier run.
+        with tempfile.TemporaryDirectory(prefix='yt_') as temp_dir:
+            base_path = os.path.join(temp_dir, f'yt_{video_id}')
+
+            result = subprocess.run(
+                ['yt-dlp', '--write-auto-sub', '--skip-download',
+                 '--sub-langs', 'en,en-en', '--convert-subs', 'srt',
+                 '-o', base_path, f'https://www.youtube.com/watch?v={video_id}'],
+                capture_output=True,
+                text=True,
+                timeout=60,
+            )
+
+            # Read subtitle file if created - try multiple formats
+            sub_files = [
+                f'{base_path}.en.srt',
+                f'{base_path}.en.vtt',
+                f'{base_path}.en-en.srt',
+                f'{base_path}.en-en.vtt',
+            ]
+            for sub_file in sub_files:
+                try:
+                    with open(sub_file, 'r', encoding='utf-8') as f:
+                        return f.read()
+                except FileNotFoundError:
+                    continue
+
+            # No subtitle file: distinguish "yt-dlp failed" from "video has
+            # no captions" instead of reporting both as a missing transcript.
+            if result.returncode != 0:
+                detail = (result.stderr or '').strip()
+                if not detail:
+                    detail = f'yt-dlp exited with status {result.returncode}'
+                return f"Error: {detail}"
+            return None
+    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
+        # OSError: yt-dlp missing / temp dir problems; SubprocessError:
+        # timeout; UnicodeError: undecodable output or subtitle file.
+        return f"Error: {e}"
+
+def parse_srt(srt_content):
+    """Parse SRT (or WebVTT) subtitle text into timestamped entries.
+
+    Args:
+        srt_content: Subtitle file contents as a string; may be empty or None.
+
+    Returns:
+        A list of ``{'time': start, 'text': text}`` dicts, one per cue that
+        has text. ``start`` is the cue's start timestamp with ``,`` replaced
+        by ``.``; ``text`` is the cue's lines joined with single spaces.
+    """
+    if not srt_content:
+        return []
+
+    # Normalise CRLF/CR so blank-line splitting works on Windows-style files.
+    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')
+
+    entries = []
+    # Cues are separated by blank lines, which may contain stray whitespace.
+    for block in re.split(r'\n\s*\n', normalized.strip()):
+        lines = [line.strip() for line in block.split('\n')]
+
+        # Locate the timing line instead of assuming it is line 2: SRT cues
+        # start with an index line, WebVTT cues usually do not.
+        timing_idx = next(
+            (i for i, line in enumerate(lines) if '-->' in line), None
+        )
+        if timing_idx is None:
+            continue  # header ("WEBVTT"), note, or malformed block
+
+        text = ' '.join(line for line in lines[timing_idx + 1:] if line)
+        if not text:
+            continue
+
+        # Extract start time
+        start_time = lines[timing_idx].split('-->')[0].strip().replace(',', '.')
+
+        entries.append({
+            'time': start_time,
+            'text': text,
+        })
+
+    return entries
+
+def chunk_and_summarize(entries, chunk_size=2000):
+    """Build a Markdown summary of parsed transcript entries.
+
+    Args:
+        entries: List of dicts with a 'time' key (timestamp string such as
+            "HH:MM:SS.mmm") and a 'text' key.
+        chunk_size: Maximum number of transcript characters shown in the
+            "Full Context" section.
+
+    Returns:
+        The summary as one newline-joined string, or a fixed message when
+        *entries* is empty.
+    """
+    if not entries:
+        return "No transcript available for this video."
+
+    def clock(stamp):
+        # Drop fractional seconds but keep the whole clock part; slicing to
+        # five characters would show "HH:MM" for an "HH:MM:SS.mmm" stamp.
+        return stamp.replace(',', '.').split('.')[0]
+
+    def clip(text, limit):
+        # Only mark text as truncated when something was actually cut.
+        return text[:limit] + "..." if len(text) > limit else text
+
+    full_text = ' '.join(e['text'] for e in entries)
+
+    # Sample every Nth entry for key points, capped at 15 samples
+    step = max(1, len(entries) // 10)
+    sampled = entries[::step][:15]
+
+    result = [
+        "## Transcript Summary",
+        "",
+        f"**Total entries:** {len(entries)}",
+        # Start time of the last cue, hence only an approximation.
+        f"**Duration:** ~{clock(entries[-1]['time'])}",
+        "",
+        "### Key Points with Timestamps",
+        "",
+    ]
+    result.extend(
+        f"- **{clock(entry['time'])}** -- {clip(entry['text'], 100)}"
+        for entry in sampled
+    )
+    result.append("")
+    result.append(f"### Full Context (first {chunk_size} chars)")
+    result.append(clip(full_text, chunk_size))
+
+    return "\n".join(result)
+
+def main():
+    """Command-line entry point: summarize the video whose URL is argv[1].
+
+    Prints the summary or a diagnostic to stdout. Exits with status 1 on a
+    usage error, an unrecognized URL, a missing transcript, or a failed
+    transcript download, so callers can detect failure from the exit code.
+    """
+    # Fix Windows encoding issues. stdout may have been replaced by an
+    # object without reconfigure(), so only call it when it exists.
+    reconfigure = getattr(sys.stdout, 'reconfigure', None)
+    if reconfigure is not None:
+        reconfigure(encoding='utf-8')
+
+    if len(sys.argv) < 2:
+        print("Usage: youtube-summarizer.py <youtube_url>")
+        sys.exit(1)
+
+    url = sys.argv[1]
+    video_id = extract_video_id(url)
+
+    if not video_id:
+        print(f"ERROR: Could not extract video ID from: {url}")
+        print("Supported formats: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID")
+        sys.exit(1)
+
+    print(f"Processing video: {video_id}")
+
+    srt_content = get_transcript_yt_dlp(video_id)
+
+    # None means no subtitle file; an empty file is equally unusable.
+    if not srt_content:
+        print(f"No transcript available for video: {video_id}")
+        print("This video may not have auto-generated captions, or they may be restricted.")
+        print("Try a different video with visible captions enabled.")
+        sys.exit(1)
+
+    # The downloader reports failures as a string starting with "Error".
+    if srt_content.startswith("Error"):
+        print("ERROR: Failed to download transcript")
+        print(f"Details: {srt_content}")
+        sys.exit(1)
+
+    entries = parse_srt(srt_content)
+    summary = chunk_and_summarize(entries)
+    print(summary)
+
+# Run the CLI only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()