Fresh start - excluded large ROM JSON files
This commit is contained in:
152
tools/youtube-summarizer.py
Normal file
152
tools/youtube-summarizer.py
Normal file
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
YouTube Video Summarizer
|
||||
Extracts transcripts and generates bullet summaries with timestamps
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from urllib.parse import urlparse, parse_qs
|
||||
|
||||
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognized shapes (scheme and subdomain are optional):
    ``youtube.com/watch?v=ID`` (``v`` may be any query parameter),
    ``youtu.be/ID``, and ``youtube.com/{embed,shorts,live,v}/ID``.

    Args:
        url: The URL (or URL fragment) to inspect.

    Returns:
        The video ID string, or None when *url* is not a string or does not
        contain a recognizable video ID.
    """
    if not isinstance(url, str):
        return None

    # Exactly 11 ID characters: the lookahead rejects longer runs instead of
    # silently truncating them into a different (bogus) ID.
    video_id = r'([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])'

    patterns = [
        # v= must be a whole parameter name (first, or after & / ;), so
        # look-alikes such as "rev=" are not mistaken for it.
        r'youtube\.com/watch\?(?:[^#]*[&;])?v=' + video_id,
        # Short links and path-based forms.
        r'(?:youtu\.be/|youtube\.com/(?:embed|shorts|live|v)/)' + video_id,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None
|
||||
|
||||
def get_transcript_yt_dlp(video_id):
    """Download the English auto-generated subtitles for a video via yt-dlp.

    Args:
        video_id: YouTube video ID; it is interpolated into the watch URL.

    Returns:
        The subtitle file contents as a string; None when yt-dlp exited
        successfully but produced no English subtitle file; or a string
        starting with "Error: " when something failed (yt-dlp missing,
        timeout, non-zero exit status, unreadable file, ...).
    """
    try:
        # Work in a private scratch directory inside the system temp dir
        # (cross-platform). Writing straight into the shared temp dir left
        # files behind forever and let a stale file from an earlier run be
        # returned as if it were the fresh download.
        with tempfile.TemporaryDirectory(prefix='yt_') as scratch_dir:
            base_path = os.path.join(scratch_dir, f'yt_{video_id}')

            result = subprocess.run(
                ['yt-dlp', '--write-auto-sub', '--skip-download',
                 '--sub-langs', 'en,en-en', '--convert-subs', 'srt',
                 '-o', base_path, f'https://www.youtube.com/watch?v={video_id}'],
                capture_output=True,
                text=True,
                timeout=60,
            )

            # Read subtitle file if created - try multiple formats. The .vtt
            # names cover the case where the conversion to srt did not happen.
            for suffix in ('.en.srt', '.en.vtt', '.en-en.srt', '.en-en.vtt'):
                try:
                    with open(base_path + suffix, 'r', encoding='utf-8') as f:
                        return f.read()
                except FileNotFoundError:
                    continue

            # No file and a failing exit status is a download error, not a
            # video without captions; surface yt-dlp's own message.
            if result.returncode != 0:
                detail = (result.stderr or '').strip() or 'no error output'
                return f"Error: yt-dlp exited with status {result.returncode}: {detail}"

            return None
    except Exception as e:
        # Callers detect failure by the "Error" prefix, so every failure is
        # reported in-band rather than raised.
        return f"Error: {e}"
|
||||
|
||||
def parse_srt(srt_content):
    """Parse SRT (or WebVTT) subtitle text into timestamped entries.

    A cue is any blank-line-separated block containing a ``-->`` timing line;
    the lines after it are the cue text. Blocks without a timing line (such as
    a WebVTT header) or without text are skipped.

    Args:
        srt_content: Raw subtitle file contents; may be None or empty.

    Returns:
        A list of dicts with keys ``'time'`` (the cue start time, with the SRT
        decimal comma replaced by a dot) and ``'text'`` (the cue's text lines
        joined with single spaces), in file order.
    """
    if not srt_content:
        return []

    # Normalize CRLF / CR so block and line splitting behave the same for
    # content that did not pass through universal-newline decoding.
    normalized = srt_content.replace('\r\n', '\n').replace('\r', '\n')

    entries = []
    # One or more blank (or whitespace-only) lines separate cues.
    for block in re.split(r'\n\s*\n', normalized.strip()):
        lines = block.split('\n')

        # Locate the timing line rather than assuming it is the second line:
        # SRT has an index line before it, WebVTT cues usually do not.
        cue_index = next(
            (i for i, line in enumerate(lines) if '-->' in line), None
        )
        if cue_index is None:
            continue

        text_lines = lines[cue_index + 1:]
        if not text_lines:
            continue

        # Extract start time (00:00:00,000 --> 00:00:05,000)
        start_time = lines[cue_index].split('-->')[0].strip().replace(',', '.')

        entries.append({
            'time': start_time,
            'text': ' '.join(text_lines),
        })

    return entries
|
||||
|
||||
def chunk_and_summarize(entries, chunk_size=2000):
    """Render parsed transcript entries as a Markdown summary.

    The summary lists the entry count, the start time of the last entry as an
    approximate duration, up to 15 evenly sampled entries as timestamped
    bullets, and the beginning of the full transcript text.

    Args:
        entries: List of dicts with ``'time'`` and ``'text'`` keys.
        chunk_size: Maximum number of characters of the joined transcript
            shown in the "Full Context" section.

    Returns:
        The Markdown summary as a single string, or a fixed message when
        *entries* is empty.
    """
    if not entries:
        return "No transcript available for this video."

    full_text = ' '.join(e['text'] for e in entries)

    # Sample every Nth entry for key points, capped at the top 15 samples.
    step = max(1, len(entries) // 10)
    sampled = entries[::step][:15]

    result = [
        "## Transcript Summary",
        "",
        f"**Total entries:** {len(entries)}",
        f"**Duration:** ~{_format_timestamp(entries[-1]['time'])}",
        "",
        "### Key Points with Timestamps",
        "",
    ]

    for entry in sampled:
        time = _format_timestamp(entry['time'])
        text = entry['text'][:100]  # First 100 chars
        result.append(f"- **{time}** -- {text}...")

    result.append("")
    result.append(f"### Full Context (first {chunk_size} chars)")
    if len(full_text) > chunk_size:
        result.append(full_text[:chunk_size] + "...")
    else:
        result.append(full_text)

    return "\n".join(result)


def _format_timestamp(raw_time):
    """Render 'HH:MM:SS.mmm' as 'MM:SS', keeping the hours only when non-zero."""
    # Slicing the first five characters would give HH:MM and drop the seconds.
    clock = raw_time.split('.')[0].strip()
    parts = clock.split(':')
    if len(parts) == 3 and not parts[0].strip('0'):
        return ':'.join(parts[1:])
    return clock
|
||||
|
||||
def main():
    """Command-line entry point: summarize the video named by ``sys.argv[1]``.

    Prints the summary (or a diagnostic message) to stdout. Exits with
    status 1 when the URL argument is missing, the video ID cannot be
    extracted, no transcript is available, or the download failed.
    """
    # Fix Windows encoding issues. Guarded because stdout may have been
    # replaced by a stream object that has no reconfigure() method.
    if hasattr(sys.stdout, 'reconfigure'):
        sys.stdout.reconfigure(encoding='utf-8')

    if len(sys.argv) < 2:
        print("Usage: youtube-summarizer.py <youtube_url>")
        sys.exit(1)

    url = sys.argv[1]
    video_id = extract_video_id(url)

    if not video_id:
        print(f"ERROR: Could not extract video ID from: {url}")
        print("Supported formats: youtube.com/watch?v=ID, youtu.be/ID, youtube.com/shorts/ID")
        sys.exit(1)

    print(f"Processing video: {video_id}")

    srt_content = get_transcript_yt_dlp(video_id)

    # None means no subtitle file was produced; an empty string means the
    # file was empty. Neither is a download error.
    if not srt_content:
        print(f"No transcript available for video: {video_id}")
        print("This video may not have auto-generated captions, or they may be restricted.")
        print("Try a different video with visible captions enabled.")
        sys.exit(1)

    # get_transcript_yt_dlp reports failures in-band as "Error..." strings.
    if srt_content.startswith("Error"):
        print("ERROR: Failed to download transcript")
        print(f"Details: {srt_content}")
        sys.exit(1)

    entries = parse_srt(srt_content)
    print(chunk_and_summarize(entries))
|
||||
|
||||
# Run the CLI only when this file is executed as a script, so importing the
# module (e.g. to reuse its helpers) has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user