#!/usr/bin/env python3
"""
YouTube Video Summarizer
Extracts transcripts and generates bullet summaries with timestamps
"""

import sys
import re
import json
import subprocess
import tempfile
import os
from urllib.parse import urlparse, parse_qs

# Prefix that get_transcript_yt_dlp() puts on its return value to signal failure.
_ERROR_PREFIX = "Error:"

# Subtitle file suffixes to look for next to the output base path, in order
# of preference.
_SUBTITLE_SUFFIXES = ('.en.srt', '.en.vtt', '.en-en.srt', '.en-en.vtt')

# URL shapes from which an 11-character video ID can be extracted.
_VIDEO_ID_PATTERNS = (
    r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
    r'youtube\.com/watch\?.*v=([a-zA-Z0-9_-]{11})',
    r'youtube\.com/shorts/([a-zA-Z0-9_-]{11})',
)


def extract_video_id(url):
    """Extract YouTube video ID from various URL formats.

    Supports watch, youtu.be, embed and shorts URLs.

    Args:
        url: The URL (or any string containing one) to inspect.

    Returns:
        The 11-character video ID, or None if no pattern matches.
    """
    for pattern in _VIDEO_ID_PATTERNS:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


def get_transcript_yt_dlp(video_id):
    """Get transcript using yt-dlp.

    Runs ``yt-dlp`` to fetch English auto-generated subtitles into a private
    temporary directory, which is removed again before returning.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The raw subtitle file content (SRT, or VTT if only that exists),
        None if yt-dlp succeeded but produced no subtitle file, or a string
        starting with "Error:" describing the failure.
    """
    try:
        # A private directory keeps concurrent runs apart and guarantees the
        # downloaded subtitle files are cleaned up.
        with tempfile.TemporaryDirectory(prefix='yt_transcript_') as temp_dir:
            base_path = os.path.join(temp_dir, f'yt_{video_id}')

            result = subprocess.run(
                ['yt-dlp', '--write-auto-sub', '--skip-download',
                 '--sub-langs', 'en,en-en', '--convert-subs', 'srt',
                 '-o', base_path,
                 f'https://www.youtube.com/watch?v={video_id}'],
                capture_output=True,
                text=True,
                # Decode explicitly so a non-UTF-8 locale cannot make reading
                # the captured output raise.
                encoding='utf-8',
                errors='replace',
                timeout=60,
            )

            # Read subtitle file if created - try multiple formats. Files are
            # checked before the exit status so a usable subtitle file is
            # still returned even if a later yt-dlp step failed.
            for suffix in _SUBTITLE_SUFFIXES:
                try:
                    with open(base_path + suffix, 'r', encoding='utf-8') as f:
                        return f.read()
                except FileNotFoundError:
                    continue

            if result.returncode != 0:
                detail = (result.stderr or '').strip()
                if not detail:
                    detail = f'yt-dlp exited with status {result.returncode}'
                return f"{_ERROR_PREFIX} {detail}"

            return None
    except Exception as e:
        # Top-level boundary: missing yt-dlp binary, timeout, I/O errors, ...
        return f"{_ERROR_PREFIX} {e}"


def parse_srt(srt_content):
    """Parse SRT (or VTT) content into text with timestamps.

    Each cue is located by its ``-->`` timing line rather than by position,
    so cues without an index line and non-cue blocks (such as a WEBVTT
    header) are handled. CRLF line endings are accepted.

    Args:
        srt_content: Subtitle file content, or None/empty.

    Returns:
        A list of ``{'time': start_time, 'text': text}`` dicts in file
        order, where ``start_time`` uses ``.`` as the fractional separator.
        Empty list if there is no content.
    """
    if not srt_content:
        return []

    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')

    entries = []
    for block in re.split(r'\n\s*\n', normalized.strip()):
        lines = [line.strip() for line in block.split('\n')]

        timing_index = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if timing_index is None:
            # Not a cue (e.g. file header or note block).
            continue

        text = ' '.join(line for line in lines[timing_index + 1:] if line)
        if not text:
            continue

        # Extract start time: "00:00:00,000 --> 00:00:05,000" -> "00:00:00.000"
        start_time = lines[timing_index].split('-->')[0].strip().replace(',', '.')

        entries.append({
            'time': start_time,
            'text': text,
        })

    return entries


def _format_timestamp(timestamp):
    """Shorten "HH:MM:SS.mmm" to "MM:SS", keeping the hours when non-zero."""
    clock = timestamp.split('.')[0]
    parts = clock.split(':')
    if len(parts) == 3 and parts[0].strip('0') == '':
        parts = parts[1:]
    return ':'.join(parts)


def chunk_and_summarize(entries, chunk_size=2000):
    """Chunk transcript and return summary format.

    Args:
        entries: List of ``{'time': ..., 'text': ...}`` dicts as produced by
            parse_srt().
        chunk_size: Maximum number of characters of the joined transcript to
            include in the "Full Context" section.

    Returns:
        A Markdown-formatted summary string, or a fixed message if
        ``entries`` is empty.
    """
    if not entries:
        return "No transcript available for this video."

    full_text = ' '.join(e['text'] for e in entries)

    # Sample every Nth entry for key points
    sampled = entries[::max(1, len(entries) // 10)]

    result = [
        "## Transcript Summary",
        "",
        f"**Total entries:** {len(entries)}",
        # Start time of the last cue, used as an approximation of the length.
        f"**Duration:** ~{_format_timestamp(entries[-1]['time'])}",
        "",
        "### Key Points with Timestamps",
        "",
    ]

    for entry in sampled[:15]:  # Top 15 samples
        stamp = _format_timestamp(entry['time'])
        text = entry['text']
        if len(text) > 100:
            # Only mark the text as cut off when it actually was.
            text = text[:100] + "..."
        result.append(f"- **{stamp}** -- {text}")

    result.append("")
    result.append(f"### Full Context (first {chunk_size} chars)")
    if len(full_text) > chunk_size:
        result.append(full_text[:chunk_size] + "...")
    else:
        result.append(full_text)

    return "\n".join(result)


def main():
    """Command-line entry point: summarize the video given as argv[1].

    Prints the summary to stdout. Exits with status 1 on bad usage, an
    unrecognized URL, a missing transcript, or a download failure.
    """
    # Fix Windows encoding issues. stdout may have been replaced by an object
    # without reconfigure(), so only call it when available.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(encoding='utf-8')

    if len(sys.argv) < 2:
        print("Usage: youtube-summarizer.py <youtube-url>")
        sys.exit(1)

    url = sys.argv[1]
    video_id = extract_video_id(url)

    if not video_id:
        print(f"ERROR: Could not extract video ID from: {url}")
        print("Supported formats: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID")
        sys.exit(1)

    print(f"Processing video: {video_id}")

    srt_content = get_transcript_yt_dlp(video_id)

    if not srt_content:
        # None (no subtitle file) or an empty subtitle file.
        print(f"No transcript available for video: {video_id}")
        print("This video may not have auto-generated captions, or they may be restricted.")
        print("Try a different video with visible captions enabled.")
        sys.exit(1)

    if srt_content.startswith(_ERROR_PREFIX):
        print("ERROR: Failed to download transcript")
        print(f"Details: {srt_content}")
        sys.exit(1)

    entries = parse_srt(srt_content)
    summary = chunk_and_summarize(entries)
    print(summary)


if __name__ == '__main__':
    main()