openclaw-workspace/tools/youtube-summarizer.py

#!/usr/bin/env python3
"""
YouTube Video Summarizer
Extracts transcripts and generates bullet summaries with timestamps
"""

import sys
import re
import json
import subprocess
import tempfile
import os
from urllib.parse import urlparse, parse_qs

def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms are ``youtube.com/watch?v=ID`` (with ``v`` anywhere in
    the query string), ``youtu.be/ID``, ``youtube.com/embed/ID`` and
    ``youtube.com/shorts/ID``.

    Args:
        url: URL text to inspect.

    Returns:
        The video ID string, or ``None`` when ``url`` is not a string or no
        supported pattern matches.
    """
    if not isinstance(url, str):
        return None

    patterns = (
        r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
        # ``v`` must be a whole parameter name: require a query separator
        # right before it so parameters such as ``prev=`` are not mistaken
        # for the video ID. ';' also covers HTML-escaped '&amp;v='.
        r'youtube\.com/watch\?(?:[^#]*?[&;])?v=([a-zA-Z0-9_-]{11})',
        r'youtube\.com/shorts/([a-zA-Z0-9_-]{11})',
    )

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_transcript_yt_dlp(video_id):
    """Download English auto-generated subtitles for a video via yt-dlp.

    yt-dlp writes into a private temporary directory that is removed before
    this function returns, so no subtitle files are left behind and a stale
    file from an earlier run can never be picked up.

    Args:
        video_id: YouTube video ID used to build the watch URL.

    Returns:
        The subtitle file contents as a string; ``None`` when yt-dlp exited
        successfully but produced no matching subtitle file; or a string
        starting with ``"Error: "`` when yt-dlp could not be run, timed out,
        or exited with a non-zero status without producing subtitles.
    """
    # This function is the boundary that turns every failure into the
    # "Error: ..." string callers check for, hence the broad except below.
    try:
        with tempfile.TemporaryDirectory(prefix='yt_') as temp_dir:
            base_path = os.path.join(temp_dir, f'yt_{video_id}')

            result = subprocess.run(
                ['yt-dlp', '--write-auto-sub', '--skip-download',
                 '--sub-langs', 'en,en-en', '--convert-subs', 'srt',
                 '-o', base_path, f'https://www.youtube.com/watch?v={video_id}'],
                capture_output=True,
                text=True,
                errors='replace',  # undecodable tool output must not abort the run
                timeout=60,
            )

            # Read subtitle file if created - try multiple formats
            sub_files = [
                f'{base_path}.en.srt',
                f'{base_path}.en.vtt',
                f'{base_path}.en-en.srt',
                f'{base_path}.en-en.vtt',
            ]
            for sub_file in sub_files:
                try:
                    with open(sub_file, 'r', encoding='utf-8') as f:
                        return f.read()
                except FileNotFoundError:
                    continue

            # No subtitle file: distinguish "video has no captions" from a
            # failed yt-dlp run instead of reporting both as "no transcript".
            if result.returncode != 0:
                detail = (result.stderr or '').strip()
                if not detail:
                    detail = f'yt-dlp exited with status {result.returncode}'
                return f"Error: {detail}"
            return None
    except Exception as e:
        return f"Error: {e}"

def parse_srt(srt_content):
    """Parse SRT (or WebVTT) subtitle text into timestamped entries.

    Blocks are separated by one or more blank lines. Within a block the
    timing line is located by its ``-->`` marker rather than by position, so
    cues without a leading index line are handled and blocks that carry no
    timing line (such as a ``WEBVTT`` header) are skipped.

    Args:
        srt_content: Raw subtitle text; may be ``None`` or empty.

    Returns:
        A list of ``{'time': start, 'text': text}`` dicts in file order, where
        ``start`` is the cue start time with ``,`` replaced by ``.`` and
        ``text`` is the cue's text lines joined with single spaces.
    """
    if not srt_content:
        return []

    # Normalise line endings so CRLF files split into blocks correctly.
    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')

    entries = []
    for block in re.split(r'\n\s*\n', normalized.strip()):
        lines = block.split('\n')

        timing_idx = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if timing_idx is None:
            continue  # header / note block, not a cue

        text = ' '.join(
            part.strip() for part in lines[timing_idx + 1:] if part.strip()
        )
        if not text:
            continue

        # Start time is everything before the arrow; anything after the end
        # time (e.g. VTT cue settings) is on the other side and ignored.
        start_time = lines[timing_idx].split('-->')[0].strip().replace(',', '.')

        entries.append({
            'time': start_time,
            'text': text,
        })

    return entries

def _format_timestamp(raw_time):
    """Render ``HH:MM:SS.mmm`` as ``MM:SS``, keeping hours only when non-zero."""
    clock = raw_time.strip().replace(',', '.').split('.', 1)[0]
    parts = clock.split(':')
    if len(parts) == 3 and not parts[0].strip('0'):
        return ':'.join(parts[1:])
    return clock


def chunk_and_summarize(entries, chunk_size=2000):
    """Build a markdown summary of a parsed transcript.

    The summary lists the entry count, an approximate duration (the start
    time of the last entry), up to 15 evenly sampled entries with their
    timestamps, and the leading ``chunk_size`` characters of the full text.

    Args:
        entries: List of ``{'time': ..., 'text': ...}`` dicts.
        chunk_size: Maximum number of characters of the joined transcript
            text to include in the "Full Context" section.

    Returns:
        The summary as a single newline-joined string, or a fixed message
        when ``entries`` is empty.
    """
    if not entries:
        return "No transcript available for this video."

    full_text = ' '.join(e['text'] for e in entries)

    # Sample roughly every tenth of the transcript, capped at 15 points.
    step = max(1, len(entries) // 10)
    sampled = entries[::step][:15]

    result = [
        "## Transcript Summary",
        "",
        f"**Total entries:** {len(entries)}",
        f"**Duration:** ~{_format_timestamp(entries[-1]['time'])}",
        "",
        "### Key Points with Timestamps",
        "",
    ]

    for entry in sampled:
        text = entry['text']
        # Only mark the snippet as elided when something was actually cut.
        snippet = text[:100] + "..." if len(text) > 100 else text
        result.append(f"- **{_format_timestamp(entry['time'])}** -- {snippet}")

    result.append("")
    result.append(f"### Full Context (first {chunk_size} chars)")
    if len(full_text) > chunk_size:
        result.append(full_text[:chunk_size] + "...")
    else:
        result.append(full_text)

    return "\n".join(result)

def main():
    """Command-line entry point: summarize the video given as ``argv[1]``.

    Prints the summary to stdout. Exits with status 1 when no URL is given,
    the URL is not a recognised YouTube link, no transcript is available, or
    the transcript download fails.
    """
    # Fix Windows encoding issues. stdout may have been replaced by an object
    # without reconfigure() (e.g. a StringIO), so only call it when present.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if callable(reconfigure):
        reconfigure(encoding='utf-8')

    if len(sys.argv) < 2:
        print("Usage: youtube-summarizer.py <youtube_url>")
        sys.exit(1)

    url = sys.argv[1]
    video_id = extract_video_id(url)

    if not video_id:
        print(f"ERROR: Could not extract video ID from: {url}")
        print("Supported formats: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID")
        sys.exit(1)

    print(f"Processing video: {video_id}")

    srt_content = get_transcript_yt_dlp(video_id)

    # None means no subtitle file was produced; an empty string means the
    # file had no content. Neither is a download error.
    if not srt_content:
        print(f"No transcript available for video: {video_id}")
        print("This video may not have auto-generated captions, or they may be restricted.")
        print("Try a different video with visible captions enabled.")
        sys.exit(1)

    if srt_content.startswith("Error"):
        print("ERROR: Failed to download transcript")
        print(f"Details: {srt_content}")
        sys.exit(1)

    entries = parse_srt(srt_content)
    summary = chunk_and_summarize(entries)
    print(summary)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()