153 lines
4.7 KiB
Python
153 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
YouTube Video Summarizer
|
|
Extracts transcripts and generates bullet summaries with timestamps
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
import json
|
|
import subprocess
|
|
import tempfile
|
|
import os
|
|
from urllib.parse import urlparse, parse_qs
|
|
|
|
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    The patterns are searched for anywhere in the string, so the scheme and
    host prefixes such as ``www.`` or ``m.`` are optional. Recognised forms:

    * ``youtube.com/watch?v=ID`` (``v`` may follow other query parameters)
    * ``youtu.be/ID``
    * ``youtube.com/embed/ID`` and ``youtube-nocookie.com/embed/ID``
    * ``youtube.com/shorts/ID``, ``youtube.com/live/ID``, ``youtube.com/v/ID``

    Args:
        url: Candidate URL string.

    Returns:
        The video ID, or ``None`` when ``url`` is not a string or no pattern
        matches.
    """
    if not isinstance(url, str):
        return None

    video_id = r'([a-zA-Z0-9_-]{11})'
    patterns = [
        r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube(?:-nocookie)?\.com/embed/)'
        + video_id,
        # ``v`` must be a complete parameter name: it has to follow ``&`` (or
        # ``;``, which also covers an HTML-escaped ``&amp;``). Otherwise a
        # parameter such as ``hv=`` would be mistaken for the video ID.
        r'youtube\.com/watch\?(?:.*[&;])?v=' + video_id,
        r'youtube\.com/(?:shorts|live|v)/' + video_id,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
|
|
|
|
def get_transcript_yt_dlp(video_id):
    """Download English auto-generated captions for a video using yt-dlp.

    yt-dlp writes its output into a private temporary directory that is
    removed before this function returns. This keeps subtitle files from
    piling up in the system temp directory, and guarantees that a file left
    over from an earlier run is never returned as the result of a failed one.

    Args:
        video_id: YouTube video ID, interpolated into the watch URL.

    Returns:
        * The raw subtitle file content when a subtitle file was written
          (``.srt`` is preferred; ``.vtt`` is used if only that exists).
        * ``None`` when yt-dlp exited successfully without writing a
          subtitle file.
        * A string starting with ``"Error: "`` when yt-dlp could not be run,
          timed out, exited non-zero without producing subtitles, or any
          other exception occurred. Exceptions are never propagated.
    """
    # Local import: only this function needs it, for temp-dir cleanup.
    import shutil

    try:
        temp_dir = tempfile.mkdtemp(prefix='yt_subs_')
    except Exception as e:
        return f"Error: {e}"

    try:
        base_path = os.path.join(temp_dir, f'yt_{video_id}')

        result = subprocess.run(
            ['yt-dlp', '--write-auto-sub', '--skip-download',
             '--sub-langs', 'en,en-en', '--convert-subs', 'srt',
             '-o', base_path, f'https://www.youtube.com/watch?v={video_id}'],
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Read subtitle file if created - try multiple formats, in order of
        # preference.
        sub_files = [
            f'{base_path}.en.srt',
            f'{base_path}.en.vtt',
            f'{base_path}.en-en.srt',
            f'{base_path}.en-en.vtt',
        ]
        for sub_file in sub_files:
            try:
                with open(sub_file, 'r', encoding='utf-8') as f:
                    return f.read()
            except FileNotFoundError:
                continue

        # No subtitle file. Separate "yt-dlp failed" from "the video has no
        # captions" so the caller can report the real cause.
        if result.returncode != 0:
            detail = (result.stderr or result.stdout or '').strip()
            return f"Error: yt-dlp exited with code {result.returncode}: {detail}"
        return None
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Best-effort cleanup; a failure here must not mask the result.
        shutil.rmtree(temp_dir, ignore_errors=True)
|
|
|
|
def parse_srt(srt_content):
    """Parse SRT (or WebVTT) subtitle text into timestamped entries.

    Cues are separated by blank lines. Within each cue, the first line that
    contains ``-->`` is taken as the timestamp line and every following line
    as the cue text. Looking for the arrow instead of assuming a fixed line
    position means that:

    * blocks without a timestamp (such as a WebVTT header) are skipped
      instead of yielding a garbage entry;
    * cues with or without a leading index/identifier line are both handled.

    Windows (``\\r\\n``) and old-Mac (``\\r``) line endings are normalised
    first so that cue separation works for them too.

    Args:
        srt_content: Subtitle file content, or a falsy value.

    Returns:
        A list of ``{'time': <start>, 'text': <text>}`` dicts in file order.
        ``time`` is the cue's start timestamp with ``,`` replaced by ``.``
        (e.g. ``"00:00:05.000"``); ``text`` is the cue's lines joined with a
        single space. Cues without any text line are omitted. Returns ``[]``
        for empty or ``None`` input.
    """
    if not srt_content:
        return []

    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')

    entries = []
    for block in normalized.strip().split('\n\n'):
        lines = block.split('\n')

        # Locate the "start --> end" line rather than assuming it is lines[1].
        stamp_index = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if stamp_index is None:
            continue

        text_lines = lines[stamp_index + 1:]
        if not text_lines:
            continue

        # Anything after the arrow (end time, VTT cue settings) is ignored.
        start_time = lines[stamp_index].split('-->')[0].strip().replace(',', '.')
        entries.append({
            'time': start_time,
            'text': ' '.join(text_lines),
        })

    return entries
|
|
|
|
def _format_timestamp(raw):
    """Render a subtitle timestamp as ``MM:SS``, or ``H:MM:SS`` from one hour on.

    Accepts ``HH:MM:SS.mmm`` / ``HH:MM:SS,mmm`` and the shorter ``MM:SS.mmm``
    form. If the value cannot be parsed, its first eight characters are
    returned unchanged so that something is still shown.
    """
    fields = raw.strip().replace(',', '.').split(':')
    try:
        seconds = int(float(fields[-1]))
        minutes = int(fields[-2]) if len(fields) >= 2 else 0
        hours = int(fields[-3]) if len(fields) >= 3 else 0
    except (ValueError, OverflowError):
        return raw[:8]

    if hours:
        return f"{hours}:{minutes:02d}:{seconds:02d}"
    return f"{minutes:02d}:{seconds:02d}"


def chunk_and_summarize(entries, chunk_size=2000):
    """Build a Markdown digest of a parsed transcript.

    The digest contains the entry count, an approximate duration (the start
    time of the last entry), up to 15 evenly sampled entries as timestamped
    bullets, and the first ``chunk_size`` characters of the full text.

    Args:
        entries: List of ``{'time': str, 'text': str}`` dicts in playback
            order.
        chunk_size: Maximum number of characters of the joined transcript to
            include in the "Full Context" section.

    Returns:
        The digest as a single newline-joined string, or the message
        ``"No transcript available for this video."`` when ``entries`` is
        empty.
    """
    if not entries:
        return "No transcript available for this video."

    full_text = ' '.join(e['text'] for e in entries)

    # Sample every Nth entry (aiming for about ten) and cap the list at 15.
    step = max(1, len(entries) // 10)
    sampled = entries[::step][:15]

    result = [
        "## Transcript Summary",
        "",
        f"**Total entries:** {len(entries)}",
        f"**Duration:** ~{_format_timestamp(entries[-1]['time'])}",
        "",
        "### Key Points with Timestamps",
        "",
    ]

    for entry in sampled:
        text = entry['text']
        # Add the ellipsis only when the text was actually truncated.
        snippet = text[:100] + "..." if len(text) > 100 else text
        result.append(f"- **{_format_timestamp(entry['time'])}** -- {snippet}")

    result.append("")
    result.append(f"### Full Context (first {chunk_size} chars)")
    result.append(
        full_text[:chunk_size] + "..." if len(full_text) > chunk_size else full_text
    )

    return "\n".join(result)
|
|
|
|
def main():
    """Command-line entry point: summarize the video given as ``sys.argv[1]``.

    Prints progress and results to stdout. Exits with status 1 when no URL is
    given, when no video ID can be extracted from it, or when downloading the
    transcript fails. A video with no captions is reported but is not treated
    as an error exit.
    """
    # Fix Windows encoding issues. ``reconfigure`` exists only on real text
    # streams, so skip it when stdout has been replaced or is missing rather
    # than crashing with AttributeError.
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is not None:
        reconfigure(encoding='utf-8')

    if len(sys.argv) < 2:
        print("Usage: youtube-summarizer.py <youtube_url>")
        sys.exit(1)

    url = sys.argv[1]
    video_id = extract_video_id(url)

    if not video_id:
        print(f"ERROR: Could not extract video ID from: {url}")
        print("Supported formats: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID")
        sys.exit(1)

    print(f"Processing video: {video_id}")

    srt_content = get_transcript_yt_dlp(video_id)

    # ``None`` means no subtitle file was produced. An empty file carries no
    # transcript either, so report both the same way instead of as a
    # download error.
    if not srt_content:
        print(f"No transcript available for video: {video_id}")
        print("This video may not have auto-generated captions, or they may be restricted.")
        print("Try a different video with visible captions enabled.")
        return

    # get_transcript_yt_dlp reports failures as a string starting with "Error".
    if srt_content.startswith("Error"):
        print("ERROR: Failed to download transcript")
        print(f"Details: {srt_content}")
        # Exit non-zero, like the other ERROR path, so callers can detect it.
        sys.exit(1)

    entries = parse_srt(srt_content)
    summary = chunk_and_summarize(entries)
    print(summary)
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|