fix video pipeline: timecode sync, face framing in prompts, no grouping when timecodes present
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1313568857
commit
d0bb727baa
2 changed files with 44 additions and 31 deletions
|
|
@@ -117,6 +117,15 @@ Rules for scene structure fields:
|
|||
- outcome: what this scene resolves or leads to (null if opening scene)
|
||||
- story_value: narrative purpose — one of: hook|problem|feature|climax|resolution|cta|transition
|
||||
|
||||
Rules for timecode and duration:
|
||||
- timecode: MUST be the cumulative start time of this scene (HH:MM:SS). Scene 1 starts at 00:00:00. Each next scene starts at previous timecode + previous duration_sec.
|
||||
- duration_sec: MUST exactly match the length of the voiceover text. Count ~2.5 words per second for Russian, ~3 words per second for English.
|
||||
- voiceover and duration_sec MUST be in sync: if voiceover has 15 words in Russian → duration_sec = 6.
|
||||
|
||||
Rules for action (visual description):
|
||||
- If the video involves people or faces: ALWAYS describe a tight face/portrait shot unless the scene specifically needs wide frame. Example: "Close-up of man's face, intense expression, dramatic lighting"
|
||||
- Describe the main subject clearly — camera framing (close-up, medium shot, wide), subject action, lighting
|
||||
|
||||
Response format:
|
||||
{
|
||||
"title": "video title",
|
||||
|
|
@@ -137,7 +146,7 @@ Response format:
|
|||
"outcome": "what this resolves or leads to, or null",
|
||||
"story_value": "hook|problem|feature|climax|resolution|cta|transition",
|
||||
"voiceover": "exact words spoken by narrator in target language",
|
||||
"action": "detailed English description of what is on screen: camera, subject, movement, lighting"
|
||||
"action": "detailed English description of what is on screen: camera framing, subject, movement, lighting"
|
||||
}
|
||||
],
|
||||
"music_mood": "upbeat|calm|dramatic|funny|inspirational",
|
||||
|
|
|
|||
|
|
@@ -88,37 +88,30 @@ def parse_json_transcription(json_path):
|
|||
combined_text = ". ".join(texts).replace("\n", " ").replace("\\n", " ")
|
||||
print(f" Объединённый текст: {combined_text[:100]}...")
|
||||
|
||||
# Определяем время
|
||||
if "selected_timestamp" in data.get("source", {}):
|
||||
# Используем таймкод из source
|
||||
time_str = data["source"]["selected_timestamp"]
|
||||
print(f" Время из source.selected_timestamp: {time_str}")
|
||||
parts = time_str.split(':')
|
||||
# Определяем время — приоритет: timecode (story-gen video mode) > start_time > source > i*duration
|
||||
def parse_timecode(tc: str) -> float:
|
||||
parts = tc.strip().split(':')
|
||||
if len(parts) == 3:
|
||||
hours, minutes, seconds = parts
|
||||
total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
|
||||
return int(parts[0]) * 3600 + int(parts[1]) * 60 + float(parts[2])
|
||||
elif len(parts) == 2:
|
||||
minutes, seconds = parts
|
||||
total_seconds = int(minutes) * 60 + float(seconds)
|
||||
else:
|
||||
total_seconds = float(parts[0])
|
||||
elif "start_time" in scene:
|
||||
start_time_str = scene["start_time"]
|
||||
print(f" Время из scene.start_time: {start_time_str}")
|
||||
# Парсим время в секундах
|
||||
parts = start_time_str.split(':')
|
||||
if len(parts) == 3:
|
||||
hours, minutes, seconds = parts
|
||||
total_seconds = int(hours) * 3600 + int(minutes) * 60 + float(seconds)
|
||||
elif len(parts) == 2:
|
||||
minutes, seconds = parts
|
||||
total_seconds = int(minutes) * 60 + float(seconds)
|
||||
else:
|
||||
total_seconds = float(parts[0])
|
||||
return int(parts[0]) * 60 + float(parts[1])
|
||||
return float(parts[0])
|
||||
|
||||
if "timecode" in scene and scene["timecode"]:
|
||||
total_seconds = parse_timecode(scene["timecode"])
|
||||
print(f" Время из scene.timecode: {scene['timecode']} → {total_seconds}с")
|
||||
elif "start_time" in scene and scene["start_time"]:
|
||||
total_seconds = parse_timecode(scene["start_time"])
|
||||
print(f" Время из scene.start_time: {scene['start_time']} → {total_seconds}с")
|
||||
elif "selected_timestamp" in data.get("source", {}):
|
||||
total_seconds = parse_timecode(data["source"]["selected_timestamp"])
|
||||
print(f" Время из source.selected_timestamp: {total_seconds}с")
|
||||
else:
|
||||
# Если нет времени, используем индекс сцены * 5 секунд
|
||||
total_seconds = i * 5
|
||||
print(f" Время не найдено, используем индекс: {total_seconds}с")
|
||||
# Fallback: накапливаем по duration_sec предыдущих сцен
|
||||
total_seconds = sum(
|
||||
s.get("duration_sec", 5) for s in scenes[:i]
|
||||
)
|
||||
print(f" Время вычислено по duration_sec: {total_seconds}с")
|
||||
|
||||
entries.append((total_seconds, combined_text, scene_id))
|
||||
print(f" {GREEN}✓ Добавлена фраза для озвучки{RESET}")
|
||||
|
|
@@ -201,8 +194,19 @@ async def main_async(json_path, output_dir, voice, max_chars):
|
|||
for ts, text, scene_id in entries:
|
||||
print(f" - [{ts:.2f}с] Сцена {scene_id}: {text[:50]}...")
|
||||
|
||||
print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
|
||||
grouped_entries = group_sentences(entries, max_chars=max_chars)
|
||||
# Если сцены имеют точные таймкоды — не группируем, каждая сцена отдельный сегмент
|
||||
try:
|
||||
_raw = json.loads(Path(json_path).read_text(encoding='utf-8'))
|
||||
has_timecodes = any(s.get("timecode") for s in _raw.get("scenes", []))
|
||||
except Exception:
|
||||
has_timecodes = False
|
||||
|
||||
if has_timecodes:
|
||||
print(f"{GREEN}Режим точных таймкодов — каждая сцена отдельный сегмент.{RESET}")
|
||||
grouped_entries = [(ts, text, [sid]) for ts, text, sid in entries]
|
||||
else:
|
||||
print(f"\n{YELLOW}Группировка фраз (макс. {max_chars} символов)...{RESET}")
|
||||
grouped_entries = group_sentences(entries, max_chars=max_chars)
|
||||
print(f"{GREEN}Сформировано {len(grouped_entries)} фрагментов для озвучки.{RESET}")
|
||||
|
||||
manifest_path = output_path / "segments.txt"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue