fix video pipeline: timecode sync, face framing in prompts, no grouping when timecodes present
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1313568857
commit
d0bb727baa
2 changed files with 44 additions and 31 deletions
|
|
@@ -117,6 +117,15 @@ Rules for scene structure fields:
|
||||||
- outcome: what this scene resolves or leads to (null if opening scene)
|
- outcome: what this scene resolves or leads to (null if opening scene)
|
||||||
- story_value: narrative purpose — one of: hook|problem|feature|climax|resolution|cta|transition
|
- story_value: narrative purpose — one of: hook|problem|feature|climax|resolution|cta|transition
|
||||||
|
|
||||||
|
Rules for timecode and duration:
|
||||||
|
- timecode: MUST be the cumulative start time of this scene (HH:MM:SS). Scene 1 starts at 00:00:00. Each next scene starts at previous timecode + previous duration_sec.
|
||||||
|
- duration_sec: MUST exactly match the length of the voiceover text. Count ~2.5 words per second for Russian, ~3 words per second for English.
|
||||||
|
- voiceover and duration_sec MUST be in sync: if voiceover has 15 words in Russian → duration_sec = 6.
|
||||||
|
|
||||||
|
Rules for action (visual description):
|
||||||
|
- If the video involves people or faces: ALWAYS describe a tight face/portrait shot unless the scene specifically needs a wide frame. Example: "Close-up of man's face, intense expression, dramatic lighting"
|
||||||
|
- Describe the main subject clearly — camera framing (close-up, medium shot, wide), subject action, lighting
|
||||||
|
|
||||||
Response format:
|
Response format:
|
||||||
{
|
{
|
||||||
"title": "video title",
|
"title": "video title",
|
||||||
|
|
@@ -137,7 +146,7 @@ Response format:
|
||||||
"outcome": "what this resolves or leads to, or null",
|
"outcome": "what this resolves or leads to, or null",
|
||||||
"story_value": "hook|problem|feature|climax|resolution|cta|transition",
|
"story_value": "hook|problem|feature|climax|resolution|cta|transition",
|
||||||
"voiceover": "exact words spoken by narrator in target language",
|
"voiceover": "exact words spoken by narrator in target language",
|
||||||
"action": "detailed English description of what is on screen: camera, subject, movement, lighting"
|
"action": "detailed English description of what is on screen: camera framing, subject, movement, lighting"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"music_mood": "upbeat|calm|dramatic|funny|inspirational",
|
"music_mood": "upbeat|calm|dramatic|funny|inspirational",
|
||||||
|
|
|
||||||
|
|
@@ -88,37 +88,30 @@ def parse_json_transcription(json_path):
|
||||||
combined_text = ". ".join(texts).replace("\n", " ").replace("\\n", " ")
|
combined_text = ". ".join(texts).replace("\n", " ").replace("\\n", " ")
|
||||||
print(f" Объединённый текст: {combined_text[:100]}...")
|
print(f" Объединённый текст: {combined_text[:100]}...")
|
||||||
|
|
||||||
# Определяем время
|
# Определяем время — приоритет: timecode (story-gen video mode) > start_time > source > i*duration
|
||||||
if "selected_timestamp" in data.get("source", {}):
|
def parse_timecode(tc: str) -> float:
|
||||||
# Используем таймкод из source
|
parts = tc.strip().split(':')
|
||||||
time_str = data["source"]["selected_timestamp"]
|
|
||||||
print(f" Время из source.selected_timestamp: {time_str}")
|
|
||||||
parts = time_str.split(':')
|
|
||||||
if len(parts) == 3:
|
if len(parts) == 3:
|
||||||
hours, minutes, seconds = parts
|
return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
|
||||||
total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
|
|
||||||
elif len(parts) == 2:
|
elif len(parts) == 2:
|
||||||
minutes, seconds = parts
|
return int(parts[0]) * 60 + float(parts[1])
|
||||||
total_seconds = int(minutes) * 60 + float(seconds)
|
return float(parts[0])
|
||||||
else:
|
|
||||||
total_seconds = float(parts[0])
|
if "timecode" in scene and scene["timecode"]:
|
||||||
elif "start_time" in scene:
|
total_seconds = parse_timecode(scene["timecode"])
|
||||||
start_time_str = scene["start_time"]
|
print(f" Время из scene.timecode: {scene['timecode']} → {total_seconds}с")
|
||||||
print(f" Время из scene.start_time: {start_time_str}")
|
elif "start_time" in scene and scene["start_time"]:
|
||||||
# Парсим время в секундах
|
total_seconds = parse_timecode(scene["start_time"])
|
||||||
parts = start_time_str.split(':')
|
print(f" Время из scene.start_time: {scene['start_time']} → {total_seconds}с")
|
||||||
if len(parts) == 3:
|
elif "selected_timestamp" in data.get("source", {}):
|
||||||
hours, minutes, seconds = parts
|
total_seconds = parse_timecode(data["source"]["selected_timestamp"])
|
||||||
total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
|
print(f" Время из source.selected_timestamp: {total_seconds}с")
|
||||||
elif len(parts) == 2:
|
|
||||||
minutes, seconds = parts
|
|
||||||
total_seconds = int(minutes) * 60 + float(seconds)
|
|
||||||
else:
|
|
||||||
total_seconds = float(parts[0])
|
|
||||||
else:
|
else:
|
||||||
# Если нет времени, используем индекс сцены * 5 секунд
|
# Fallback: накапливаем по duration_sec предыдущих сцен
|
||||||
total_seconds = i * 5
|
total_seconds = sum(
|
||||||
print(f" Время не найдено, используем индекс: {total_seconds}с")
|
s.get("duration_sec", 5) for s in scenes[:i]
|
||||||
|
)
|
||||||
|
print(f" Время вычислено по duration_sec: {total_seconds}с")
|
||||||
|
|
||||||
entries.append((total_seconds, combined_text, scene_id))
|
entries.append((total_seconds, combined_text, scene_id))
|
||||||
print(f" {GREEN}✓ Добавлена фраза для озвучки{RESET}")
|
print(f" {GREEN}✓ Добавлена фраза для озвучки{RESET}")
|
||||||
|
|
@@ -201,8 +194,19 @@ async def main_async(json_path, output_dir, voice, max_chars):
|
||||||
for ts, text, scene_id in entries:
|
for ts, text, scene_id in entries:
|
||||||
print(f" - [{ts:.2f}с] Сцена {scene_id}: {text[:50]}...")
|
print(f" - [{ts:.2f}с] Сцена {scene_id}: {text[:50]}...")
|
||||||
|
|
||||||
print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
|
# Если сцены имеют точные таймкоды — не группируем, каждая сцена отдельный сегмент
|
||||||
grouped_entries = group_sentences(entries, max_chars=max_chars)
|
try:
|
||||||
|
_raw = json.loads(Path(json_path).read_text(encoding='utf-8'))
|
||||||
|
has_timecodes = any(s.get("timecode") for s in _raw.get("scenes", []))
|
||||||
|
except Exception:
|
||||||
|
has_timecodes = False
|
||||||
|
|
||||||
|
if has_timecodes:
|
||||||
|
print(f"{GREEN}Режим точных таймкодов — каждая сцена отдельный сегмент.{RESET}")
|
||||||
|
grouped_entries = [(ts, text, [sid]) for ts, text, sid in entries]
|
||||||
|
else:
|
||||||
|
print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
|
||||||
|
grouped_entries = group_sentences(entries, max_chars=max_chars)
|
||||||
print(f"{GREEN}Сформировано {len(grouped_entries)} фрагментов для озвучки.{RESET}")
|
print(f"{GREEN}Сформировано {len(grouped_entries)} фрагментов для озвучки.{RESET}")
|
||||||
|
|
||||||
manifest_path = output_path / "segments.txt"
|
manifest_path = output_path / "segments.txt"
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue