fix video pipeline: timecode sync, face framing in prompts, no grouping when timecodes present

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-21 00:25:34 +03:00 · 2026-04-21 00:25:34 +03:00 · d0bb727baa
commit d0bb727baa
parent 1313568857
2 changed files with 44 additions and 31 deletions
--- a/story-gen/scripts/generate.py
+++ b/story-gen/scripts/generate.py
@@ -117,6 +117,15 @@ Rules for scene structure fields:
 - outcome: what this scene resolves or leads to (null if opening scene)
 - story_value: narrative purpose — one of: hook|problem|feature|climax|resolution|cta|transition
 Rules for timecode and duration:
 - timecode: MUST be the cumulative start time of this scene (HH:MM:SS). Scene 1 starts at 00:00:00. Each next scene starts at previous timecode + previous duration_sec.
 - duration_sec: MUST exactly match the length of the voiceover text. Count ~2.5 words per second for Russian, ~3 words per second for English.
 - voiceover and duration_sec MUST be in sync: if voiceover has 15 words in Russian → duration_sec = 6.
 Rules for action (visual description):
 - If the video involves people or faces: ALWAYS describe a tight face/portrait shot unless the scene specifically needs wide frame. Example: "Close-up of man's face, intense expression, dramatic lighting"
 - Describe the main subject clearly — camera framing (close-up, medium shot, wide), subject action, lighting
 Response format:
 {
  "title": "video title",
@@ -137,7 +146,7 @@ Response format:
      "outcome": "what this resolves or leads to, or null",
      "story_value": "hook|problem|feature|climax|resolution|cta|transition",
      "voiceover": "exact words spoken by narrator in target language",
-      "action": "detailed English description of what is on screen: camera, subject, movement, lighting"
+      "action": "detailed English description of what is on screen: camera framing, subject, movement, lighting"
    }
  ],
  "music_mood": "upbeat|calm|dramatic|funny|inspirational",
--- a/voice/tts_generate.py
+++ b/voice/tts_generate.py
@@ -88,37 +88,30 @@ def parse_json_transcription(json_path):
        combined_text = ". ".join(texts).replace("\n", " ").replace("\\n", " ")
        print(f"  Объединённый текст: {combined_text[:100]}...")
-        # Определяем время
+        # Определяем время — приоритет: timecode (story-gen video mode) > start_time > source > i*duration
-        if "selected_timestamp" in data.get("source", {}):
+        def parse_timecode(tc: str) -> float:
-            # Используем таймкод из source
+            parts = tc.strip().split(':')
            time_str = data["source"]["selected_timestamp"]
            print(f"  Время из source.selected_timestamp: {time_str}")
            parts = time_str.split(':')
            if len(parts) == 3:
-                hours, minutes, seconds = parts
+                return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
                total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
            elif len(parts) == 2:
-                minutes, seconds = parts
+                return int(parts[0]) * 60 + float(parts[1])
-                total_seconds = int(minutes) * 60 + float(seconds)
+            return float(parts[0])
-            else:
+
-                total_seconds = float(parts[0])
+        if "timecode" in scene and scene["timecode"]:
-        elif "start_time" in scene:
+            total_seconds = parse_timecode(scene["timecode"])
-            start_time_str = scene["start_time"]
+            print(f"  Время из scene.timecode: {scene['timecode']} → {total_seconds}с")
-            print(f"  Время из scene.start_time: {start_time_str}")
+        elif "start_time" in scene and scene["start_time"]:
-            # Парсим время в секундах
+            total_seconds = parse_timecode(scene["start_time"])
-            parts = start_time_str.split(':')
+            print(f"  Время из scene.start_time: {scene['start_time']} → {total_seconds}с")
-            if len(parts) == 3:
+        elif "selected_timestamp" in data.get("source", {}):
-                hours, minutes, seconds = parts
+            total_seconds = parse_timecode(data["source"]["selected_timestamp"])
-                total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
+            print(f"  Время из source.selected_timestamp: {total_seconds}с")
            elif len(parts) == 2:
                minutes, seconds = parts
                total_seconds = int(minutes) * 60 + float(seconds)
            else:
                total_seconds = float(parts[0])
        else:
-            # Если нет времени, используем индекс сцены * 5 секунд
+            # Fallback: накапливаем по duration_sec предыдущих сцен
-            total_seconds = i * 5
+            total_seconds = sum(
-            print(f"  Время не найдено, используем индекс: {total_seconds}с")
+                s.get("duration_sec", 5) for s in scenes[:i]
            )
            print(f"  Время вычислено по duration_sec: {total_seconds}с")
        entries.append((total_seconds, combined_text, scene_id))
        print(f"  {GREEN}✓ Добавлена фраза для озвучки{RESET}")
@@ -201,8 +194,19 @@ async def main_async(json_path, output_dir, voice, max_chars):
    for ts, text, scene_id in entries:
        print(f"  - [{ts:.2f}с] Сцена {scene_id}: {text[:50]}...")
-    print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
+    # Если сцены имеют точные таймкоды — не группируем, каждая сцена отдельный сегмент
-    grouped_entries = group_sentences(entries, max_chars=max_chars)
+    try:
        _raw = json.loads(Path(json_path).read_text(encoding='utf-8'))
        has_timecodes = any(s.get("timecode") for s in _raw.get("scenes", []))
    except Exception:
        has_timecodes = False
    if has_timecodes:
        print(f"{GREEN}Режим точных таймкодов — каждая сцена отдельный сегмент.{RESET}")
        grouped_entries = [(ts, text, [sid]) for ts, text, sid in entries]
    else:
        print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
        grouped_entries = group_sentences(entries, max_chars=max_chars)
    print(f"{GREEN}Сформировано {len(grouped_entries)} фрагментов для озвучки.{RESET}")
    manifest_path = output_path / "segments.txt"