Add youtube transcript collection skill:
Co-authored-by: UfukNode <ufuk@crivacy.io>
This commit is contained in:
parent
9eb4a4a481
commit
9cc2cf3241
3 changed files with 184 additions and 0 deletions
1
skills/media/DESCRIPTION.md
Normal file
1
skills/media/DESCRIPTION.md
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
Media content extraction and transformation tools — YouTube transcripts, audio, video processing.
|
||||||
71
skills/media/youtube-content/SKILL.md
Normal file
71
skills/media/youtube-content/SKILL.md
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
---
|
||||||
|
name: youtube-content
|
||||||
|
description: Fetch YouTube video transcripts and transform them into structured content (chapters, summaries, threads, blog posts).
|
||||||
|
---
|
||||||
|
|
||||||
|
# YouTube Content Tool
|
||||||
|
|
||||||
|
Extract transcripts from YouTube videos and convert them into useful formats.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install youtube-transcript-api
|
||||||
|
```
|
||||||
|
|
||||||
|
## Helper script
|
||||||
|
|
||||||
|
This skill includes `fetch_transcript.py` — use it to fetch transcripts quickly:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# JSON output with metadata
|
||||||
|
python3 SKILL_DIR/scripts/fetch_transcript.py "https://youtube.com/watch?v=VIDEO_ID"
|
||||||
|
|
||||||
|
# With timestamps
|
||||||
|
python3 SKILL_DIR/scripts/fetch_transcript.py "https://youtube.com/watch?v=VIDEO_ID" --timestamps
|
||||||
|
|
||||||
|
# Plain text output (good for piping into further processing)
|
||||||
|
python3 SKILL_DIR/scripts/fetch_transcript.py "https://youtube.com/watch?v=VIDEO_ID" --text-only
|
||||||
|
|
||||||
|
# Specific language with fallback
|
||||||
|
python3 SKILL_DIR/scripts/fetch_transcript.py "https://youtube.com/watch?v=VIDEO_ID" --language tr,en
|
||||||
|
|
||||||
|
# Timestamped plain text
|
||||||
|
python3 SKILL_DIR/scripts/fetch_transcript.py "https://youtube.com/watch?v=VIDEO_ID" --text-only --timestamps
|
||||||
|
```
|
||||||
|
|
||||||
|
`SKILL_DIR` is the directory containing this SKILL.md file.
|
||||||
|
|
||||||
|
## URL formats supported
|
||||||
|
|
||||||
|
The script accepts any of these formats (or a raw 11-character video ID):
|
||||||
|
|
||||||
|
- `https://www.youtube.com/watch?v=VIDEO_ID`
|
||||||
|
- `https://youtu.be/VIDEO_ID`
|
||||||
|
- `https://youtube.com/shorts/VIDEO_ID`
|
||||||
|
- `https://youtube.com/embed/VIDEO_ID`
|
||||||
|
- `https://youtube.com/live/VIDEO_ID`
|
||||||
|
|
||||||
|
## Output formats
|
||||||
|
|
||||||
|
After fetching the transcript, format it based on what the user asks for:
|
||||||
|
|
||||||
|
- **Chapters**: Group by topic shifts, output timestamped chapter list (`00:00 Introduction`, `03:45 Main Topic`, etc.)
|
||||||
|
- **Summary**: Concise 5-10 sentence overview of the entire video
|
||||||
|
- **Chapter summaries**: Chapters with a short paragraph summary for each
|
||||||
|
- **Thread**: Twitter/X thread format — numbered posts, each under 280 chars
|
||||||
|
- **Blog post**: Full article with title, sections, and key takeaways
|
||||||
|
- **Quotes**: Notable quotes with timestamps
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
1. Fetch the transcript using the helper script
|
||||||
|
2. If the transcript is very long (>50K chars), summarize in chunks
|
||||||
|
3. Transform into the requested output format using your own reasoning
|
||||||
|
|
||||||
|
## Error handling
|
||||||
|
|
||||||
|
- **Transcript disabled**: Some videos have transcripts turned off — tell the user
|
||||||
|
- **Private/unavailable**: The API will raise an error — relay it clearly
|
||||||
|
- **No matching language**: Without `--language` the script requests only the library's default language (English). Retry with explicit codes in priority order, e.g. `--language tr,en`
|
||||||
|
- **Dependency missing**: Run `pip install youtube-transcript-api` first
|
||||||
112
skills/media/youtube-content/scripts/fetch_transcript.py
Normal file
112
skills/media/youtube-content/scripts/fetch_transcript.py
Normal file
|
|
@ -0,0 +1,112 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Fetch a YouTube video transcript and output it as structured JSON or plain text.

Usage:
    python fetch_transcript.py <url_or_video_id> [--language en,tr] [--timestamps] [--text-only]

Output (JSON):
    {
        "video_id": "...",
        "segment_count": 42,
        "duration": "3:45",
        "full_text": "complete transcript as plain text",
        "timestamped_text": "0:00 first line\n0:05 second line\n..."
    }

"timestamped_text" is present only when --timestamps is given. With --text-only
the transcript is printed as plain text (timestamped if --timestamps is also
given) instead of JSON. On failure a JSON object {"error": "..."} is printed
and the exit status is 1.
|
||||||
|
|
||||||
|
Install dependency: pip install youtube-transcript-api
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def extract_video_id(url_or_id: str) -> str:
    """Extract the 11-character video ID from various YouTube URL formats.

    Recognised inputs are ``watch?v=ID`` URLs, ``youtu.be/ID`` short links,
    ``shorts/ID``, ``embed/ID`` and ``live/ID`` paths, and a bare
    11-character ID. Surrounding whitespace is ignored.

    Args:
        url_or_id: A YouTube URL or a raw video ID.

    Returns:
        The extracted video ID, or the stripped input unchanged when no
        pattern matches (the caller then lets the API report the failure).
    """
    url_or_id = url_or_id.strip()
    patterns = [
        # ``\b`` anchors ``v=`` to the start of a parameter name so that
        # parameters merely ending in "v" (e.g. ``env=...``) are not mistaken
        # for the video-id parameter.
        r'(?:\bv=|youtu\.be/|shorts/|embed/|live/)([a-zA-Z0-9_-]{11})',
        r'^([a-zA-Z0-9_-]{11})$',
    ]
    for pattern in patterns:
        match = re.search(pattern, url_or_id)
        if match:
            return match.group(1)
    return url_or_id
|
||||||
|
|
||||||
|
|
||||||
|
def format_timestamp(seconds: float) -> str:
    """Convert an offset in seconds to ``H:MM:SS`` or ``M:SS``.

    The leading field is not zero-padded (``65`` -> ``"1:05"``,
    ``3661`` -> ``"1:01:01"``); the hour field appears only when non-zero.
    Fractional seconds are truncated.

    Args:
        seconds: Offset from the start of the video, in seconds.

    Returns:
        The formatted timestamp. Negative offsets are clamped to ``"0:00"``.
    """
    # Clamp first: divmod on a negative total floors towards -inf and would
    # produce a bogus value such as "59:55" for -5.
    total = max(int(seconds), 0)
    h, remainder = divmod(total, 3600)
    m, s = divmod(remainder, 60)
    if h > 0:
        return f"{h}:{m:02d}:{s:02d}"
    return f"{m}:{s:02d}"
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_transcript(video_id: str, languages: list = None):
    """Fetch transcript segments from YouTube.

    Works with both API generations of ``youtube-transcript-api``: the
    instance-based ``YouTubeTranscriptApi().fetch()`` of 1.x releases and the
    legacy static ``YouTubeTranscriptApi.get_transcript()`` of older ones.

    Args:
        video_id: The YouTube video ID.
        languages: Language codes in priority order. When empty or ``None``
            the library's own default language selection is used.

    Returns:
        A list of segment dicts with ``text``, ``start`` and ``duration``
        keys.

    Raises:
        SystemExit: If ``youtube-transcript-api`` is not installed.
        Exception: Any error raised by the library is propagated unchanged.
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        print("Error: youtube-transcript-api not installed. Run: pip install youtube-transcript-api",
              file=sys.stderr)
        sys.exit(1)

    # Only pass ``languages`` when the caller supplied some, so the library
    # default applies otherwise.
    kwargs = {"languages": languages} if languages else {}

    if hasattr(YouTubeTranscriptApi, "fetch"):
        # 1.x API: fetch() returns a FetchedTranscript; to_raw_data() turns it
        # into the same list-of-dicts shape the legacy call returned.
        return YouTubeTranscriptApi().fetch(video_id, **kwargs).to_raw_data()

    # Legacy (< 1.0) static API.
    return YouTubeTranscriptApi.get_transcript(video_id, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: fetch a transcript and print it.

    Parses the URL/ID and options, fetches the transcript through
    ``fetch_transcript`` and prints either plain text (``--text-only``) or a
    JSON object with ``video_id``, ``segment_count``, ``duration`` and
    ``full_text``, plus ``timestamped_text`` when ``--timestamps`` is given.

    On a fetch failure a JSON object with a single ``error`` key is printed
    to stdout and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(description="Fetch YouTube transcript as JSON")
    parser.add_argument("url", help="YouTube URL or video ID")
    parser.add_argument("--language", "-l", default=None,
                        help="Comma-separated language codes in priority order (e.g. en,tr). "
                             "Default: the library's default language (English)")
    parser.add_argument("--timestamps", "-t", action="store_true",
                        help="Include timestamped text in output")
    parser.add_argument("--text-only", action="store_true",
                        help="Output plain text instead of JSON")
    args = parser.parse_args()

    video_id = extract_video_id(args.url)

    # Drop blank entries so inputs like "en," or "en, ,tr" never send an
    # empty language code to the API; an all-blank value means "no preference".
    languages = None
    if args.language:
        languages = [code.strip() for code in args.language.split(",") if code.strip()] or None

    try:
        segments = fetch_transcript(video_id, languages)
    except Exception as e:  # top-level boundary: report any API failure as JSON
        error_msg = str(e)
        lowered = error_msg.lower()
        if "disabled" in lowered:
            friendly = "Transcripts are disabled for this video."
        elif "no transcript" in lowered:
            if languages:
                # The user already chose languages; telling them to "specify
                # a language" would be misleading.
                friendly = (
                    f"No transcript found for the requested language(s): {', '.join(languages)}. "
                    "Try different codes with --language."
                )
            else:
                friendly = "No transcript found. Try specifying a language with --language."
        else:
            friendly = error_msg
        print(json.dumps({"error": friendly}))
        sys.exit(1)

    full_text = " ".join(seg["text"] for seg in segments)
    timestamped = "\n".join(
        f"{format_timestamp(seg['start'])} {seg['text']}" for seg in segments
    )

    if args.text_only:
        print(timestamped if args.timestamps else full_text)
        return

    result = {
        "video_id": video_id,
        "segment_count": len(segments),
        # End of the last segment approximates the transcript's total length.
        "duration": format_timestamp(segments[-1]["start"] + segments[-1]["duration"]) if segments else "0:00",
        "full_text": full_text,
    }
    if args.timestamps:
        result["timestamped_text"] = timestamped

    print(json.dumps(result, ensure_ascii=False, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue