Files
openclaw-workspace/tools/youtube-summarizer.py
2026-04-11 09:45:12 -05:00

153 lines
4.7 KiB
Python

#!/usr/bin/env python3
"""
YouTube Video Summarizer
Extracts transcripts and generates bullet summaries with timestamps
"""
import sys
import re
import json
import subprocess
import tempfile
import os
from urllib.parse import urlparse, parse_qs
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Supported shapes (scheme and subdomain are optional):
      * youtube.com/watch?v=ID (``v`` may appear anywhere in the query string)
      * youtu.be/ID
      * youtube.com/embed/ID, /shorts/ID, /live/ID, /v/ID
      * youtube-nocookie.com/embed/ID

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID as a string, or None when ``url`` is empty, not a
        string, or does not match any supported shape.
    """
    if not url or not isinstance(url, str):
        return None

    # Exactly 11 ID characters; the lookahead rejects longer tokens instead of
    # silently truncating them to their first 11 characters.
    video_id = r'([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])'
    patterns = [
        # ``v`` must start the query or follow ``&`` so that parameters that
        # merely end in "v" (e.g. ``rev=``) are never mistaken for the ID.
        r'youtube\.com/watch\?(?:[^#]*?&)?v=' + video_id,
        r'youtu\.be/' + video_id,
        r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/' + video_id,
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
def get_transcript_yt_dlp(video_id):
    """Download the English auto-generated subtitles for a video via yt-dlp.

    Runs the ``yt-dlp`` executable with ``--write-auto-sub --skip-download``
    and ``--convert-subs srt``, writing into a private temporary directory
    that is removed before returning, so no files are left behind and a stale
    file from an earlier run can never be picked up.

    Args:
        video_id: The YouTube video ID to fetch subtitles for.

    Returns:
        * The subtitle file contents as a string when one was produced.
        * None when yt-dlp exited successfully but wrote no subtitle file.
        * A string starting with ``"Error: "`` when yt-dlp exited with a
          non-zero status without producing a file, or when any exception
          occurred (missing executable, timeout, unreadable file, ...).
          Callers distinguish failures by this prefix.
    """
    try:
        with tempfile.TemporaryDirectory(prefix='yt_sub_') as temp_dir:
            base_path = os.path.join(temp_dir, f'yt_{video_id}')
            result = subprocess.run(
                ['yt-dlp', '--write-auto-sub', '--skip-download',
                 '--sub-langs', 'en,en-en', '--convert-subs', 'srt',
                 '-o', base_path, f'https://www.youtube.com/watch?v={video_id}'],
                capture_output=True,
                text=True,
                timeout=60,
            )

            # Try the converted .srt first, then the raw .vtt, per language tag.
            for suffix in ('.en.srt', '.en.vtt', '.en-en.srt', '.en-en.vtt'):
                try:
                    with open(base_path + suffix, 'r', encoding='utf-8') as f:
                        return f.read()
                except FileNotFoundError:
                    continue

            # No file was written: surface a yt-dlp failure instead of
            # reporting it as "no transcript".
            if result.returncode != 0:
                detail = (result.stderr or '').strip()
                if not detail:
                    detail = f'exit code {result.returncode}'
                return f"Error: yt-dlp failed: {detail}"
            return None
    except Exception as e:
        # Deliberately broad: this function reports every failure through its
        # "Error: ..." return value rather than raising.
        return f"Error: {e}"
def parse_srt(srt_content):
    """Parse subtitle text into a list of timestamped entries.

    Each cue is a blank-line-separated block containing a timing line
    (``start --> end``) followed by one or more text lines. The timing line
    is located by its ``-->`` marker rather than by position, so blocks with
    or without a leading index line are both accepted and blocks with no
    timing line (such as a ``WEBVTT`` header) are skipped.

    Args:
        srt_content: Subtitle file contents; may be None or empty.

    Returns:
        A list of ``{'time': <start time>, 'text': <cue text>}`` dicts in
        file order. The start time has any ``,`` replaced by ``.``; multi-line
        cue text is joined with single spaces. Returns ``[]`` for empty input.
    """
    if not srt_content:
        return []

    # Normalize CRLF / CR line endings so blank-line splitting works for
    # files written on any platform.
    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')

    entries = []
    for block in re.split(r'\n\s*\n', normalized.strip()):
        lines = [line.strip() for line in block.split('\n')]
        cue_index = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if cue_index is None:
            continue

        start_time = lines[cue_index].split('-->')[0].strip().replace(',', '.')
        text = ' '.join(line for line in lines[cue_index + 1:] if line)
        if not text:
            continue

        entries.append({
            'time': start_time,
            'text': text,
        })
    return entries
def _format_timestamp(timestamp):
    """Drop fractional seconds; show MM:SS, or HH:MM:SS once hours are non-zero."""
    clock = timestamp.replace(',', '.').split('.', 1)[0].strip()
    parts = clock.split(':')
    if len(parts) == 3 and parts[0].strip('0') == '':
        return ':'.join(parts[1:])
    return clock


def chunk_and_summarize(entries, chunk_size=2000):
    """Build a Markdown overview of a parsed transcript.

    The overview contains the entry count, an approximate duration (the start
    time of the last entry), up to 15 evenly strided sample entries with their
    timestamps, and the leading ``chunk_size`` characters of the full text.

    Args:
        entries: List of ``{'time': ..., 'text': ...}`` dicts as produced by
            ``parse_srt``.
        chunk_size: Maximum number of characters of the joined transcript
            text to include in the "Full Context" section.

    Returns:
        The Markdown report as a single string, or a fixed "no transcript"
        message when ``entries`` is empty.
    """
    if not entries:
        return "No transcript available for this video."

    full_text = ' '.join(e['text'] for e in entries)

    # Take roughly every tenth-of-the-transcript entry, capped at 15 samples.
    step = max(1, len(entries) // 10)
    sampled = entries[::step][:15]

    result = [
        "## Transcript Summary",
        "",
        f"**Total entries:** {len(entries)}",
        f"**Duration:** ~{_format_timestamp(entries[-1]['time'])}",
        "",
        "### Key Points with Timestamps",
        "",
    ]

    for entry in sampled:
        text = entry['text']
        # Only mark the snippet with an ellipsis when it was actually cut.
        snippet = text[:100] + "..." if len(text) > 100 else text
        result.append(f"- **{_format_timestamp(entry['time'])}** -- {snippet}")

    result.append("")
    result.append(f"### Full Context (first {chunk_size} chars)")
    if len(full_text) > chunk_size:
        result.append(full_text[:chunk_size] + "...")
    else:
        result.append(full_text)

    return "\n".join(result)
def main():
    """Command-line entry point: print a transcript summary for a YouTube URL.

    Reads the URL from ``sys.argv[1]``, extracts the video ID, downloads the
    transcript, and prints a Markdown summary to stdout. Exits with status 1
    on a usage error, an unrecognized URL, a missing transcript, or a
    download failure, so callers can detect failure from the exit code.
    """
    # Fix Windows encoding issues. Guarded because stdout may have been
    # replaced by a stream object that has no reconfigure() method.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if callable(reconfigure):
        reconfigure(encoding='utf-8')

    if len(sys.argv) < 2:
        print("Usage: youtube-summarizer.py <youtube_url>")
        sys.exit(1)

    url = sys.argv[1]
    video_id = extract_video_id(url)
    if not video_id:
        print(f"ERROR: Could not extract video ID from: {url}")
        print("Supported formats: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID")
        sys.exit(1)

    print(f"Processing video: {video_id}")
    srt_content = get_transcript_yt_dlp(video_id)

    # None means no subtitle file was produced; an empty file is treated the
    # same way, since there is nothing to summarize either way.
    if not srt_content:
        print(f"No transcript available for video: {video_id}")
        print("This video may not have auto-generated captions, or they may be restricted.")
        print("Try a different video with visible captions enabled.")
        sys.exit(1)

    # The downloader reports failures as a string with an "Error:" prefix.
    if srt_content.startswith("Error:"):
        print("ERROR: Failed to download transcript")
        print(f"Details: {srt_content}")
        sys.exit(1)

    entries = parse_srt(srt_content)
    summary = chunk_and_summarize(entries)
    print(summary)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()