fix video pipeline: timecode sync, face framing in prompts, no grouping when timecodes present

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
AMEfremova 2026-04-21 00:25:34 +03:00
parent 1313568857
commit d0bb727baa
2 changed files with 44 additions and 31 deletions

View file

@ -117,6 +117,15 @@ Rules for scene structure fields:
- outcome: what this scene resolves or leads to (null if opening scene)
- story_value: narrative purpose — one of: hook|problem|feature|climax|resolution|cta|transition
Rules for timecode and duration:
- timecode: MUST be the cumulative start time of this scene (HH:MM:SS). Scene 1 starts at 00:00:00. Each next scene starts at previous timecode + previous duration_sec.
- duration_sec: MUST exactly match the length of the voiceover text. Count ~2.5 words per second for Russian, ~3 words per second for English.
- voiceover and duration_sec MUST be in sync: if voiceover has 15 words in Russian, then duration_sec = 6 (15 words / 2.5 words per second).
Rules for action (visual description):
- If the video involves people or faces: ALWAYS describe a tight face/portrait shot unless the scene specifically needs a wide frame. Example: "Close-up of man's face, intense expression, dramatic lighting"
- Describe the main subject clearly — camera framing (close-up, medium shot, wide), subject action, lighting
Response format:
{
"title": "video title",
@ -137,7 +146,7 @@ Response format:
"outcome": "what this resolves or leads to, or null",
"story_value": "hook|problem|feature|climax|resolution|cta|transition",
"voiceover": "exact words spoken by narrator in target language",
"action": "detailed English description of what is on screen: camera, subject, movement, lighting"
"action": "detailed English description of what is on screen: camera framing, subject, movement, lighting"
}
],
"music_mood": "upbeat|calm|dramatic|funny|inspirational",

View file

@ -88,37 +88,30 @@ def parse_json_transcription(json_path):
combined_text = ". ".join(texts).replace("\n", " ").replace("\\n", " ")
print(f" Объединённый текст: {combined_text[:100]}...")
# Определяем время
if "selected_timestamp" in data.get("source", {}):
# Используем таймкод из source
time_str = data["source"]["selected_timestamp"]
print(f" Время из source.selected_timestamp: {time_str}")
parts = time_str.split(':')
# Определяем время — приоритет: timecode (story-gen video mode) > start_time > source > i*duration
def parse_timecode(tc: str) -> float:
parts = tc.strip().split(':')
if len(parts) == 3:
hours, minutes, seconds = parts
total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
elif len(parts) == 2:
minutes, seconds = parts
total_seconds = int(minutes) * 60 + float(seconds)
else:
total_seconds = float(parts[0])
elif "start_time" in scene:
start_time_str = scene["start_time"]
print(f" Время из scene.start_time: {start_time_str}")
# Парсим время в секундах
parts = start_time_str.split(':')
if len(parts) == 3:
hours, minutes, seconds = parts
total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
elif len(parts) == 2:
minutes, seconds = parts
total_seconds = int(minutes) * 60 + float(seconds)
else:
total_seconds = float(parts[0])
return int(parts[0]) * 60 + float(parts[1])
return float(parts[0])
if "timecode" in scene and scene["timecode"]:
total_seconds = parse_timecode(scene["timecode"])
print(f" Время из scene.timecode: {scene['timecode']}{total_seconds}с")
elif "start_time" in scene and scene["start_time"]:
total_seconds = parse_timecode(scene["start_time"])
print(f" Время из scene.start_time: {scene['start_time']}{total_seconds}с")
elif "selected_timestamp" in data.get("source", {}):
total_seconds = parse_timecode(data["source"]["selected_timestamp"])
print(f" Время из source.selected_timestamp: {total_seconds}с")
else:
# Если нет времени, используем индекс сцены * 5 секунд
total_seconds = i * 5
print(f" Время не найдено, используем индекс: {total_seconds}с")
# Fallback: накапливаем по duration_sec предыдущих сцен
total_seconds = sum(
s.get("duration_sec", 5) for s in scenes[:i]
)
print(f" Время вычислено по duration_sec: {total_seconds}с")
entries.append((total_seconds, combined_text, scene_id))
print(f" {GREEN}✓ Добавлена фраза для озвучки{RESET}")
@ -201,8 +194,19 @@ async def main_async(json_path, output_dir, voice, max_chars):
for ts, text, scene_id in entries:
print(f" - [{ts:.2f}с] Сцена {scene_id}: {text[:50]}...")
print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
grouped_entries = group_sentences(entries, max_chars=max_chars)
# Если сцены имеют точные таймкоды — не группируем, каждая сцена отдельный сегмент
try:
_raw = json.loads(Path(json_path).read_text(encoding='utf-8'))
has_timecodes = any(s.get("timecode") for s in _raw.get("scenes", []))
except Exception:
has_timecodes = False
if has_timecodes:
print(f"{GREEN}Режим точных таймкодов — каждая сцена отдельный сегмент.{RESET}")
grouped_entries = [(ts, text, [sid]) for ts, text, sid in entries]
else:
print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
grouped_entries = group_sentences(entries, max_chars=max_chars)
print(f"{GREEN}Сформировано {len(grouped_entries)} фрагментов для озвучки.{RESET}")
manifest_path = output_path / "segments.txt"