Add ASCII video skill to creative category
This commit is contained in:
parent
8eefbef91c
commit
c358af7861
9 changed files with 4729 additions and 0 deletions
435
skills/creative/ascii-video/references/optimization.md
Normal file
435
skills/creative/ascii-video/references/optimization.md
Normal file
|
|
@ -0,0 +1,435 @@
|
|||
# Optimization Reference
|
||||
|
||||
## Hardware Detection
|
||||
|
||||
Detect the user's hardware at script startup and adapt rendering parameters automatically. Never hardcode worker counts or resolution.
|
||||
|
||||
### CPU and Memory Detection
|
||||
|
||||
```python
|
||||
import multiprocessing
|
||||
import platform
|
||||
import shutil
|
||||
import os
|
||||
|
||||
def detect_hardware():
|
||||
"""Detect hardware capabilities and return render config."""
|
||||
cpu_count = multiprocessing.cpu_count()
|
||||
|
||||
# Leave 1-2 cores free for OS + ffmpeg encoding
|
||||
if cpu_count >= 16:
|
||||
workers = cpu_count - 2
|
||||
elif cpu_count >= 8:
|
||||
workers = cpu_count - 1
|
||||
elif cpu_count >= 4:
|
||||
workers = cpu_count - 1
|
||||
else:
|
||||
workers = max(1, cpu_count)
|
||||
|
||||
# Memory detection (platform-specific)
|
||||
try:
|
||||
if platform.system() == "Darwin":
|
||||
import subprocess
|
||||
mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())
|
||||
elif platform.system() == "Linux":
|
||||
with open("/proc/meminfo") as f:
|
||||
for line in f:
|
||||
if line.startswith("MemTotal"):
|
||||
mem_bytes = int(line.split()[1]) * 1024
|
||||
break
|
||||
else:
|
||||
mem_bytes = 8 * 1024**3 # assume 8GB on unknown
|
||||
except Exception:
|
||||
mem_bytes = 8 * 1024**3
|
||||
|
||||
mem_gb = mem_bytes / (1024**3)
|
||||
|
||||
# Each worker uses ~50-150MB depending on grid sizes
|
||||
# Cap workers if memory is tight
|
||||
mem_per_worker_mb = 150
|
||||
max_workers_by_mem = int(mem_gb * 1024 * 0.6 / mem_per_worker_mb) # use 60% of RAM
|
||||
workers = min(workers, max_workers_by_mem)
|
||||
|
||||
# ffmpeg availability and codec support
|
||||
has_ffmpeg = shutil.which("ffmpeg") is not None
|
||||
|
||||
return {
|
||||
"cpu_count": cpu_count,
|
||||
"workers": workers,
|
||||
"mem_gb": mem_gb,
|
||||
"platform": platform.system(),
|
||||
"arch": platform.machine(),
|
||||
"has_ffmpeg": has_ffmpeg,
|
||||
}
|
||||
```
|
||||
|
||||
### Adaptive Quality Profiles
|
||||
|
||||
Scale resolution, FPS, CRF, and grid density based on hardware:
|
||||
|
||||
```python
|
||||
def quality_profile(hw, target_duration_s, user_preference="auto"):
|
||||
"""
|
||||
Returns render settings adapted to hardware.
|
||||
user_preference: "auto", "draft", "preview", "production", "max"
|
||||
"""
|
||||
if user_preference == "draft":
|
||||
return {"vw": 960, "vh": 540, "fps": 12, "crf": 28, "workers": min(4, hw["workers"]),
|
||||
"grid_scale": 0.5, "shaders": "minimal", "particles_max": 200}
|
||||
|
||||
if user_preference == "preview":
|
||||
return {"vw": 1280, "vh": 720, "fps": 15, "crf": 25, "workers": hw["workers"],
|
||||
"grid_scale": 0.75, "shaders": "standard", "particles_max": 500}
|
||||
|
||||
if user_preference == "max":
|
||||
return {"vw": 3840, "vh": 2160, "fps": 30, "crf": 15, "workers": hw["workers"],
|
||||
"grid_scale": 2.0, "shaders": "full", "particles_max": 3000}
|
||||
|
||||
# "production" or "auto"
|
||||
# Auto-detect: estimate render time, downgrade if it would take too long
|
||||
n_frames = int(target_duration_s * 24)
|
||||
est_seconds_per_frame = 0.18 # ~180ms at 1080p
|
||||
est_total_s = n_frames * est_seconds_per_frame / max(1, hw["workers"])
|
||||
|
||||
if hw["mem_gb"] < 4 or hw["cpu_count"] <= 2:
|
||||
# Low-end: 720p, 15fps
|
||||
return {"vw": 1280, "vh": 720, "fps": 15, "crf": 23, "workers": hw["workers"],
|
||||
"grid_scale": 0.75, "shaders": "standard", "particles_max": 500}
|
||||
|
||||
if est_total_s > 3600: # would take over an hour
|
||||
# Downgrade to 720p to speed up
|
||||
return {"vw": 1280, "vh": 720, "fps": 24, "crf": 20, "workers": hw["workers"],
|
||||
"grid_scale": 0.75, "shaders": "standard", "particles_max": 800}
|
||||
|
||||
# Standard production: 1080p 24fps
|
||||
return {"vw": 1920, "vh": 1080, "fps": 24, "crf": 20, "workers": hw["workers"],
|
||||
"grid_scale": 1.0, "shaders": "full", "particles_max": 1200}
|
||||
|
||||
|
||||
def apply_quality_profile(profile):
|
||||
"""Set globals from quality profile."""
|
||||
global VW, VH, FPS, N_WORKERS
|
||||
VW = profile["vw"]
|
||||
VH = profile["vh"]
|
||||
FPS = profile["fps"]
|
||||
N_WORKERS = profile["workers"]
|
||||
# Grid sizes scale with resolution
|
||||
# CRF passed to ffmpeg encoder
|
||||
# Shader set determines which post-processing is active
|
||||
```
|
||||
|
||||
### CLI Integration
|
||||
|
||||
```python
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--quality", choices=["draft", "preview", "production", "max", "auto"],
|
||||
default="auto", help="Render quality preset")
|
||||
parser.add_argument("--workers", type=int, default=0, help="Override worker count (0=auto)")
|
||||
parser.add_argument("--resolution", type=str, default="", help="Override resolution e.g. 1280x720")
|
||||
args = parser.parse_args()
|
||||
|
||||
hw = detect_hardware()
|
||||
if args.workers > 0:
|
||||
hw["workers"] = args.workers
|
||||
profile = quality_profile(hw, target_duration, args.quality)
|
||||
if args.resolution:
|
||||
w, h = args.resolution.split("x")
|
||||
profile["vw"], profile["vh"] = int(w), int(h)
|
||||
apply_quality_profile(profile)
|
||||
|
||||
log(f"Hardware: {hw['cpu_count']} cores, {hw['mem_gb']:.1f}GB RAM, {hw['platform']}")
|
||||
log(f"Render: {profile['vw']}x{profile['vh']} @{profile['fps']}fps, "
|
||||
f"CRF {profile['crf']}, {profile['workers']} workers")
|
||||
```
|
||||
|
||||
## Performance Budget
|
||||
|
||||
Target: 100-200ms per frame (5-10 fps single-threaded, 40-80 fps across 8 workers).
|
||||
|
||||
| Component | Time | Notes |
|
||||
|-----------|------|-------|
|
||||
| Feature extraction | 1-5ms | Pre-computed for all frames before render |
|
||||
| Effect function | 2-15ms | Vectorized numpy, avoid Python loops |
|
||||
| Character render | 80-150ms | **Bottleneck** -- per-cell Python loop |
|
||||
| Shader pipeline | 5-25ms | Depends on active shaders |
|
||||
| ffmpeg encode | ~5ms | Amortized by pipe buffering |
|
||||
|
||||
## Bitmap Pre-Rasterization
|
||||
|
||||
Rasterize every character at init, not per-frame:
|
||||
|
||||
```python
|
||||
# At init time -- done once
|
||||
for c in all_characters:
|
||||
img = Image.new("L", (cell_w, cell_h), 0)
|
||||
ImageDraw.Draw(img).text((0, 0), c, fill=255, font=font)
|
||||
bitmaps[c] = np.array(img, dtype=np.float32) / 255.0 # float32 for fast multiply
|
||||
|
||||
# At render time -- fast lookup
|
||||
bitmap = bitmaps[char]
|
||||
canvas[y:y+ch, x:x+cw] = np.maximum(canvas[y:y+ch, x:x+cw],
|
||||
(bitmap[:,:,None] * color).astype(np.uint8))
|
||||
```
|
||||
|
||||
Collect all characters from all palettes + overlay text into the init set. Lazy-init for any missed characters.
|
||||
|
||||
## Coordinate Array Caching
|
||||
|
||||
Pre-compute all grid-relative coordinate arrays at init, not per-frame:
|
||||
|
||||
```python
|
||||
# These are O(rows*cols) and used in every effect
|
||||
self.rr = np.arange(rows)[:, None] # row indices
|
||||
self.cc = np.arange(cols)[None, :] # col indices
|
||||
self.dist = np.sqrt(dx**2 + dy**2) # distance from center
|
||||
self.angle = np.arctan2(dy, dx) # angle from center
|
||||
self.dist_n = ... # normalized distance
|
||||
```
|
||||
|
||||
## Vectorized Effect Patterns
|
||||
|
||||
### Avoid Per-Cell Python Loops in Effects
|
||||
|
||||
The render loop (compositing bitmaps) is unavoidably per-cell. But effect functions must be fully vectorized numpy -- never iterate over rows/cols in Python.
|
||||
|
||||
Bad (O(rows*cols) Python loop):
|
||||
```python
|
||||
for r in range(rows):
|
||||
for c in range(cols):
|
||||
val[r, c] = math.sin(c * 0.1 + t) * math.cos(r * 0.1 - t)
|
||||
```
|
||||
|
||||
Good (vectorized):
|
||||
```python
|
||||
val = np.sin(g.cc * 0.1 + t) * np.cos(g.rr * 0.1 - t)
|
||||
```
|
||||
|
||||
### Vectorized Matrix Rain
|
||||
|
||||
The naive per-column per-trail-pixel loop is the second biggest bottleneck after the render loop. Use numpy fancy indexing:
|
||||
|
||||
```python
|
||||
# Instead of nested Python loops over columns and trail pixels:
|
||||
# Build row index arrays for all active trail pixels at once
|
||||
all_rows = []
|
||||
all_cols = []
|
||||
all_fades = []
|
||||
for c in range(cols):
|
||||
head = int(state["ry"][c])
|
||||
trail_len = state["rln"][c]
|
||||
for i in range(trail_len):
|
||||
row = head - i
|
||||
if 0 <= row < rows:
|
||||
all_rows.append(row)
|
||||
all_cols.append(c)
|
||||
all_fades.append(1.0 - i / trail_len)
|
||||
|
||||
# Vectorized assignment
|
||||
ar = np.array(all_rows)
|
||||
ac = np.array(all_cols)
|
||||
af = np.array(all_fades, dtype=np.float32)
|
||||
# Assign chars and colors in bulk using fancy indexing
|
||||
ch[ar, ac] = ... # vectorized char assignment
|
||||
co[ar, ac, 1] = (af * bri * 255).astype(np.uint8) # green channel
|
||||
```
|
||||
|
||||
### Vectorized Fire Columns
|
||||
|
||||
Same pattern -- accumulate index arrays, assign in bulk:
|
||||
|
||||
```python
|
||||
fire_val = np.zeros((rows, cols), dtype=np.float32)
|
||||
for fi in range(n_cols):
|
||||
fx_c = int((fi * cols / n_cols + np.sin(t * 2 + fi * 0.7) * 3) % cols)
|
||||
height = int(energy * rows * 0.7)
|
||||
dy = np.arange(min(height, rows))
|
||||
fr = rows - 1 - dy
|
||||
frac = dy / max(height, 1)
|
||||
# Width spread: base columns wider at bottom
|
||||
for dx in range(-1, 2): # 3-wide columns
|
||||
c = fx_c + dx
|
||||
if 0 <= c < cols:
|
||||
fire_val[fr, c] = np.maximum(fire_val[fr, c],
|
||||
(1 - frac * 0.6) * (0.5 + rms * 0.5))
|
||||
# Now map fire_val to chars and colors in one vectorized pass
|
||||
```
|
||||
|
||||
## Bloom Optimization
|
||||
|
||||
**Do NOT use `scipy.ndimage.uniform_filter`** -- measured at 424ms/frame.
|
||||
|
||||
Use 4x downsample + manual box blur instead -- 84ms/frame (5x faster):
|
||||
|
||||
```python
|
||||
sm = canvas[::4, ::4].astype(np.float32) # 4x downsample
|
||||
br = np.where(sm > threshold, sm, 0)
|
||||
for _ in range(3): # 3-pass manual box blur
|
||||
p = np.pad(br, ((1,1),(1,1),(0,0)), mode='edge')
|
||||
br = (p[:-2,:-2] + p[:-2,1:-1] + p[:-2,2:] +
|
||||
p[1:-1,:-2] + p[1:-1,1:-1] + p[1:-1,2:] +
|
||||
p[2:,:-2] + p[2:,1:-1] + p[2:,2:]) / 9.0
|
||||
bl = np.repeat(np.repeat(br, 4, axis=0), 4, axis=1)[:H, :W]
|
||||
```
|
||||
|
||||
## Vignette Caching
|
||||
|
||||
Distance field is resolution- and strength-dependent, never changes per frame:
|
||||
|
||||
```python
|
||||
_vig_cache = {}
|
||||
def sh_vignette(canvas, strength):
|
||||
key = (canvas.shape[0], canvas.shape[1], round(strength, 2))
|
||||
if key not in _vig_cache:
|
||||
Y = np.linspace(-1, 1, H)[:, None]
|
||||
X = np.linspace(-1, 1, W)[None, :]
|
||||
_vig_cache[key] = np.clip(1.0 - np.sqrt(X**2+Y**2) * strength, 0.15, 1).astype(np.float32)
|
||||
return np.clip(canvas * _vig_cache[key][:,:,None], 0, 255).astype(np.uint8)
|
||||
```
|
||||
|
||||
Same pattern for CRT barrel distortion (cache remap coordinates).
|
||||
|
||||
## Film Grain Optimization
|
||||
|
||||
Generate noise at half resolution, tile up:
|
||||
|
||||
```python
|
||||
noise = np.random.randint(-amt, amt+1, (H//2, W//2, 1), dtype=np.int16)
|
||||
noise = np.repeat(np.repeat(noise, 2, axis=0), 2, axis=1)[:H, :W]
|
||||
```
|
||||
|
||||
2x blocky grain looks like film grain and costs 1/4 the random generation.
|
||||
|
||||
## Parallel Rendering
|
||||
|
||||
### Worker Architecture
|
||||
|
||||
```python
|
||||
hw = detect_hardware()
|
||||
N_WORKERS = hw["workers"]
|
||||
|
||||
# Batch splitting (for non-clip architectures)
|
||||
batch_size = (n_frames + N_WORKERS - 1) // N_WORKERS
|
||||
batches = [(i, i*batch_size, min((i+1)*batch_size, n_frames), features, seg_path) ...]
|
||||
|
||||
with multiprocessing.Pool(N_WORKERS) as pool:
|
||||
segments = pool.starmap(render_batch, batches)
|
||||
```
|
||||
|
||||
### Per-Clip Parallelism (Preferred for Segmented Videos)
|
||||
|
||||
```python
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
|
||||
with ProcessPoolExecutor(max_workers=N_WORKERS) as pool:
|
||||
futures = {pool.submit(render_clip, seg, features, path): seg["id"]
|
||||
for seg, path in clip_args}
|
||||
for fut in as_completed(futures):
|
||||
clip_id = futures[fut]
|
||||
try:
|
||||
fut.result()
|
||||
log(f" {clip_id} done")
|
||||
except Exception as e:
|
||||
log(f" {clip_id} FAILED: {e}")
|
||||
```
|
||||
|
||||
### Worker Isolation
|
||||
|
||||
Each worker:
|
||||
- Creates its own `Renderer` instance (with full grid + bitmap init)
|
||||
- Opens its own ffmpeg subprocess
|
||||
- Has independent random seed (`random.seed(batch_id * 10000)`)
|
||||
- Writes to its own segment file and stderr log
|
||||
|
||||
### ffmpeg Pipe Safety
|
||||
|
||||
**CRITICAL**: Never `stderr=subprocess.PIPE` with long-running ffmpeg. The stderr buffer fills at ~64KB and deadlocks:
|
||||
|
||||
```python
|
||||
# WRONG -- will deadlock
|
||||
pipe = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
|
||||
# RIGHT -- stderr to file
|
||||
stderr_fh = open(err_path, "w")
|
||||
pipe = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=stderr_fh)
|
||||
# ... write all frames ...
|
||||
pipe.stdin.close()
|
||||
pipe.wait()
|
||||
stderr_fh.close()
|
||||
```
|
||||
|
||||
### Concatenation
|
||||
|
||||
```python
|
||||
with open(concat_file, "w") as cf:
|
||||
for seg in segments:
|
||||
cf.write(f"file '{seg}'\n")
|
||||
|
||||
cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", concat_file]
|
||||
if audio_path:
|
||||
cmd += ["-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-b:a", "192k", "-shortest"]
|
||||
else:
|
||||
cmd += ["-c:v", "copy"]
|
||||
cmd.append(output_path)
|
||||
subprocess.run(cmd, capture_output=True, check=True)
|
||||
```
|
||||
|
||||
## Particle System Performance
|
||||
|
||||
Cap particle counts based on quality profile:
|
||||
|
||||
| System | Low | Standard | High |
|
||||
|--------|-----|----------|------|
|
||||
| Explosion | 300 | 1000 | 2500 |
|
||||
| Embers | 500 | 1500 | 3000 |
|
||||
| Starfield | 300 | 800 | 1500 |
|
||||
| Dissolve | 200 | 600 | 1200 |
|
||||
|
||||
Cull by truncating lists:
|
||||
```python
|
||||
MAX_PARTICLES = profile.get("particles_max", 1200)
|
||||
if len(S["px"]) > MAX_PARTICLES:
|
||||
for k in ("px", "py", "vx", "vy", "life", "char"):
|
||||
S[k] = S[k][-MAX_PARTICLES:] # keep newest
|
||||
```
|
||||
|
||||
## Memory Management
|
||||
|
||||
- Feature arrays: pre-computed for all frames, shared across workers via fork semantics (COW)
|
||||
- Canvas: allocated once per worker, reused (`np.zeros(...)`)
|
||||
- Character arrays: allocated per frame (cheap -- rows*cols U1 strings)
|
||||
- Bitmap cache: ~500KB per grid size, initialized once per worker
|
||||
|
||||
Total memory per worker: ~50-150MB. Total: ~400-800MB for 8 workers.
|
||||
|
||||
For low-memory systems (< 4GB), reduce worker count and use smaller grids.
|
||||
|
||||
## Brightness Verification
|
||||
|
||||
After render, spot-check brightness at sample timestamps:
|
||||
|
||||
```python
|
||||
for t in [2, 30, 60, 120, 180]:
|
||||
cmd = ["ffmpeg", "-ss", str(t), "-i", output_path,
|
||||
"-frames:v", "1", "-f", "rawvideo", "-pix_fmt", "rgb24", "-"]
|
||||
r = subprocess.run(cmd, capture_output=True)
|
||||
arr = np.frombuffer(r.stdout, dtype=np.uint8)
|
||||
print(f"t={t}s mean={arr.mean():.1f} max={arr.max()}")
|
||||
```
|
||||
|
||||
Target: mean > 5 for quiet sections, mean > 15 for active sections. If consistently below, increase brightness floor in effects and/or global boost multiplier.
|
||||
|
||||
## Render Time Estimates
|
||||
|
||||
Scale with hardware. Baseline: 1080p, 24fps, ~180ms/frame/worker.
|
||||
|
||||
| Duration | Frames | 4 workers | 8 workers | 16 workers |
|
||||
|----------|--------|-----------|-----------|------------|
|
||||
| 30s | 720 | ~3 min | ~2 min | ~1 min |
|
||||
| 2 min | 2,880 | ~13 min | ~7 min | ~4 min |
|
||||
| 3.5 min | 5,040 | ~23 min | ~12 min | ~6 min |
|
||||
| 5 min | 7,200 | ~33 min | ~17 min | ~9 min |
|
||||
| 10 min | 14,400 | ~65 min | ~33 min | ~17 min |
|
||||
|
||||
At 720p: multiply times by ~0.5. At 4K: multiply by ~4.
|
||||
|
||||
Heavier effects (many particles, dense grids, extra shader passes) add ~20-50%.
|
||||
Loading…
Add table
Add a link
Reference in a new issue