From a45765facc58f570f8c8d4094bcf9fae54a8eb6b Mon Sep 17 00:00:00 2001 From: AMEfremova Date: Fri, 3 Apr 2026 16:24:29 +0300 Subject: [PATCH 1/4] add video mode to generate.py with auto voice synthesis pipeline - new --mode video generates full shooting script (timecode + voiceover + action per scene) - auto-saves _voiceover.txt in [HH:MM:SS] format ready for voice_acting.py - new --voice flag immediately runs voice synthesis after script generation - new --voice-out flag to control segments output directory - update SKILL.md to document both modes and pipeline integration Co-Authored-By: Claude Sonnet 4.6 --- story-gen/SKILL.md | 98 ++++++++--- story-gen/scripts/generate.py | 156 +++++++++++++++++- story-gen/scripts/trends/aged_self.txt | 15 ++ story-gen/scripts/trends/anime.txt | 15 ++ .../scripts/trends/children_together.txt | 15 ++ story-gen/scripts/trends/doll_in_box.txt | 15 ++ story-gen/scripts/trends/flowers_in_hair.txt | 15 ++ story-gen/scripts/trends/hug_inner_child.txt | 15 ++ story-gen/scripts/trends/pet_as_human.txt | 15 ++ story-gen/scripts/trends/photo_booth.txt | 15 ++ story-gen/scripts/trends/soviet_era.txt | 15 ++ story-gen/scripts/trends/studio_photo.txt | 15 ++ .../scripts/trends/three_generations.txt | 15 ++ 13 files changed, 395 insertions(+), 24 deletions(-) create mode 100644 story-gen/scripts/trends/aged_self.txt create mode 100644 story-gen/scripts/trends/anime.txt create mode 100644 story-gen/scripts/trends/children_together.txt create mode 100644 story-gen/scripts/trends/doll_in_box.txt create mode 100644 story-gen/scripts/trends/flowers_in_hair.txt create mode 100644 story-gen/scripts/trends/hug_inner_child.txt create mode 100644 story-gen/scripts/trends/pet_as_human.txt create mode 100644 story-gen/scripts/trends/photo_booth.txt create mode 100644 story-gen/scripts/trends/soviet_era.txt create mode 100644 story-gen/scripts/trends/studio_photo.txt create mode 100644 story-gen/scripts/trends/three_generations.txt diff --git 
a/story-gen/SKILL.md b/story-gen/SKILL.md index 7ea3d03..3116058 100644 --- a/story-gen/SKILL.md +++ b/story-gen/SKILL.md @@ -34,47 +34,74 @@ Needs env: - `OPENAI_BASE_URL` — endpoint (default: https://llm.lambda.coredump.ru/v1) - `STORY_MODEL` — model (default: qwen3.5-122b) +## Two modes + +### `--mode image` (default) +Generates a storyboard scenario for image/visual generation. Each scene has a `visual_prompt` (English) ready for gpt-image-1.5 or veo-3.1. + +### `--mode video` +Generates a full shooting script for real video production. Each scene has: +- `timecode` — cumulative start time `HH:MM:SS` +- `voiceover` — exact words spoken by narrator (in target language) +- `action` — what happens on screen in English (for director / video generation) + +**Automatically saves two files when `--out` is given:** +- `scenario.json` — full structured script +- `scenario_voiceover.txt` — ready for `voice/voice_acting.py` in `[HH:MM:SS] text` format + ## Parameters | Parameter | Values | Description | |-----------|--------|-------------| -| `--format` | `wb_ad`, `reels`, `viral`, `long`, `postcard`, `educational`, `auto` | Video format | +| `--mode` | `image`, `video` | `image`: visual storyboard; `video`: full shooting script with voiceover | +| `--format` | `wb_ad`, `reels`, `viral`, `long`, `postcard`, `educational`, `auto` | Video format (image mode only) | | `--platform` | `tiktok`, `instagram`, `wb`, `youtube`, `vk`, `auto` | Target platform | | `--audience` | any text | Target audience description | | `--duration` | seconds | Target duration | | `--lang` | `ru`, `en`, `de`, `auto` | Language for voiceover and captions | -| `--analyze` | flag | Analyze assets before generating scenario | -| `--out` | filepath | Save JSON to file | +| `--analyze` | flag | Analyze assets before generating (image mode only) | +| `--out` | filepath | Save JSON to file (video mode also saves `_voiceover.txt`) | +| `--voice` | flag | After script generation, immediately run voice 
synthesis (video mode + `--out` required) | +| `--voice-out` | dirpath | Directory for voice segments (default: `voice_segments/` next to `--out`) | ## Usage examples ```bash -# WB product ad (Russian input, Russian voiceover) +# WB product ad — image storyboard (default mode) python3 {baseDir}/scripts/generate.py \ "Женская сумка из экокожи, бежевая, 2500 руб" \ --format wb_ad --platform wb -# Viral TikTok (Russian input, English voiceover) +# Full video shooting script + automatically run voice synthesis +python3 {baseDir}/scripts/generate.py \ + "Обзор беговых кроссовок Nike для TikTok" \ + --mode video --platform tiktok --duration 60 --lang ru \ + --out assets/scenario.json --voice +# → saves assets/scenario.json +# → saves assets/scenario_voiceover.txt +# → runs voice_acting.py → saves wav segments to assets/voice_segments/ +# → saves assets/voice_segments/segments.txt (manifest for combine_audio.sh) + +# Without auto voice (manual step later): +python3 {baseDir}/scripts/generate.py \ + "Обзор беговых кроссовок Nike для TikTok" \ + --mode video --platform tiktok --duration 60 --lang ru \ + --out assets/scenario.json +# Then manually: +python3 voice/voice_acting.py assets/scenario_voiceover.txt -o assets/voice_segments + +# Viral TikTok image storyboard (English voiceover) python3 {baseDir}/scripts/generate.py \ "Анекдот про программиста и кофе" \ --format viral --platform tiktok --lang en -# With asset analysis (auto-detect format and platform) -python3 {baseDir}/scripts/generate.py \ - "Женская сумка, фото на белом фоне, WB карточка" \ - --analyze - -# Long educational video +# Long educational video shooting script python3 {baseDir}/scripts/generate.py \ "How to choose your first bicycle" \ - --format long --platform youtube --duration 120 --lang en - -# Save to file for pipeline -python3 {baseDir}/scripts/generate.py \ - "Котёнок впервые видит снег" \ - --format viral --out /tmp/scenario.json + --mode video --platform youtube --duration 120 --lang en \ + 
--out assets/bicycle_scenario.json ``` -## Output JSON structure +## Output JSON — image mode ```json { "title": "video title", @@ -89,23 +116,50 @@ python3 {baseDir}/scripts/generate.py \ { "id": 1, "duration_sec": 5, - "visual_prompt": "ALWAYS IN ENGLISH — detailed prompt for gpt-image-1.5 or veo-3.1, style, lighting, camera angle, colors", + "visual_prompt": "ALWAYS IN ENGLISH — detailed prompt for gpt-image-1.5 or veo-3.1", "visual_type": "image|video_clip|text_only", "voiceover": "narration text in target language", "caption": "on-screen text in target language" } ], - "storyboard_grid_prompt": "NxN storyboard grid — all scenes as one image for consistent character generation. Feed into gpt-image-1 or nanobanana. null if no recurring subject.", + "storyboard_grid_prompt": "NxN storyboard grid — all scenes as one image. null if no recurring subject.", "music_mood": "upbeat|calm|dramatic|funny|inspirational", "style_notes": "overall style and delivery notes", "asset_analysis": null } ``` +## Output JSON — video mode +```json +{ + "title": "video title", + "platform": "tiktok|instagram|wb|youtube|vk", + "language": "ru|en|...", + "duration_sec": 60, + "hook": "first 3 seconds — what grabs attention", + "target_audience": "who watches this", + "scenes": [ + { + "id": 1, + "timecode": "00:00:00", + "duration_sec": 5, + "voiceover": "exact words spoken by narrator in target language", + "action": "detailed English description of what is on screen: camera, subject, movement, lighting" + } + ], + "music_mood": "upbeat|calm|dramatic|funny|inspirational", + "style_notes": "overall visual style, pacing, tone" +} +``` + ## Pipeline integration -Output feeds directly into: +**Image mode** output feeds into: - `visual_prompt` → image generation (`gpt-image-1.5`) or video (`veo-3.1`) -- `voiceover` → TTS (`Pocket-TTS` or `ElevenLabs`) +- `voiceover` → TTS - `caption` + `duration_sec` → ffmpeg montage (`ffmpeg-editing/SKILL.md`) + +**Video mode** output feeds into: +- 
`_voiceover.txt` → `voice/voice_acting.py` for speech synthesis +- `action` per scene → video generation or director instructions - Full JSON → orchestrator (`../SKILL.md`) diff --git a/story-gen/scripts/generate.py b/story-gen/scripts/generate.py index 5862d72..0a1937c 100755 --- a/story-gen/scripts/generate.py +++ b/story-gen/scripts/generate.py @@ -5,7 +5,7 @@ Supports any platform, audience, content format, and language. Input can be in any language. Skill code and prompts are English-only. """ -import sys, os, json, argparse +import sys, os, json, argparse, subprocess from urllib import request, error # Fix Windows console encoding (cp1251 can't handle ₽, emoji, etc.) @@ -67,6 +67,43 @@ Response format: "asset_analysis": null }""" +VIDEO_SYSTEM_PROMPT = """You are a professional scriptwriter for real video production (not image generation). + +You receive a topic, idea, or product description in ANY language. +You return ONLY valid JSON with no markdown or comments. + +Your job: write a full shooting script with voiceover and on-screen action descriptions, timed to the second. 
+ +Rules: +- timecode: cumulative start time in format HH:MM:SS +- voiceover: what the narrator says out loud — exact words, in the input language (or --lang if given) +- action: what happens on screen — camera movement, subject action, scene change — detailed, in English +- voiceover and action run in parallel (same time block) +- duration_sec: how long this scene lasts +- No scene shorter than 3 seconds + +Response format: +{ + "title": "video title", + "platform": "tiktok|instagram|wb|youtube|vk", + "language": "ru|en|...", + "duration_sec": 60, + "hook": "first 3 seconds — what grabs attention", + "target_audience": "who watches this", + "scenes": [ + { + "id": 1, + "timecode": "00:00:00", + "duration_sec": 5, + "voiceover": "exact words spoken by narrator in target language", + "action": "detailed English description of what is on screen: camera, subject, movement, lighting" + } + ], + "music_mood": "upbeat|calm|dramatic|funny|inspirational", + "style_notes": "overall visual style, pacing, tone" +}""" + + ASSET_PROMPT = """You analyze input data for video production. Your task is to extract maximum information from what is given. 
@@ -163,9 +200,66 @@ def generate(input_text, format_hint="auto", platform="auto", return json.loads(content.strip()) +def generate_video(input_text, platform="auto", audience="", duration=None, lang="auto"): + """Generate a full video shooting script with timecoded voiceover and action descriptions.""" + if not API_KEY: + print("Error: OPENAI_API_KEY not set", file=sys.stderr) + sys.exit(1) + + parts = [f"Input: {input_text}"] + if platform != "auto": parts.append(f"Platform: {platform}") + if audience: parts.append(f"Target audience: {audience}") + if duration: parts.append(f"Duration: {duration} seconds") + if lang != "auto": parts.append(f"Output language: {lang}") + + payload = json.dumps({ + "model": MODEL, + "messages": [ + {"role": "system", "content": VIDEO_SYSTEM_PROMPT}, + {"role": "user", "content": "\n".join(parts)} + ], + "temperature": 0.8 + }).encode() + + req = request.Request( + f"{API_URL}/chat/completions", + data=payload, + headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"} + ) + + try: + with request.urlopen(req, timeout=60) as resp: + data = json.loads(resp.read()) + except error.URLError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + content = data["choices"][0]["message"]["content"].strip() + if content.startswith("```"): + content = content.split("```")[1] + if content.startswith("json"): + content = content[4:] + + return json.loads(content.strip()) + + +def scenario_to_video_txt(scenario: dict) -> str: + """Convert video scenario JSON to video.txt format: [HH:MM:SS] voiceover text.""" + lines = [] + for scene in scenario.get("scenes", []): + tc = scene.get("timecode", "00:00:00") + voiceover = scene.get("voiceover", "").strip() + if voiceover: + lines.append(f"[{tc}] {voiceover}") + return "\n".join(lines) + + def main(): parser = argparse.ArgumentParser(description="Universal video scenario generator") parser.add_argument("input", help="Product description, idea, topic, or joke in any 
language") + parser.add_argument("--mode", default="image", + choices=["image", "video"], + help="image: storyboard for image/visual generation; video: full shooting script with voiceover") parser.add_argument("--format", default="auto", choices=["auto","wb_ad","reels","viral","long","postcard","educational"]) parser.add_argument("--platform", default="auto", @@ -173,10 +267,68 @@ def main(): parser.add_argument("--audience", default="", help="Target audience description") parser.add_argument("--duration", type=int, default=None, help="Target duration in seconds") parser.add_argument("--lang", default="auto", help="Output language: ru, en, de, auto") - parser.add_argument("--analyze", action="store_true", help="Analyze assets before generating") + parser.add_argument("--analyze", action="store_true", help="Analyze assets before generating (image mode only)") parser.add_argument("--out", default=None, help="Save JSON output to file") + parser.add_argument("--voice", action="store_true", + help="After generating video script, immediately run voice synthesis (video mode only). 
Requires --out.") + parser.add_argument("--voice-out", default=None, + help="Output directory for voice segments (default: voice_segments/ next to --out)") args = parser.parse_args() + if args.mode == "video": + print("Generating video shooting script...", file=sys.stderr) + result = generate_video(args.input, args.platform, args.audience, args.duration, args.lang) + print(f" Title: {result.get('title')}", file=sys.stderr) + print(f" Scenes: {len(result.get('scenes', []))}", file=sys.stderr) + print(f" Duration: {result.get('duration_sec')}s", file=sys.stderr) + + output = json.dumps(result, ensure_ascii=False, indent=2) + + # Always need --out for video mode to save voiceover txt + if not args.out: + print("Warning: use --out to save scenario and voiceover files", file=sys.stderr) + print(output) + return + + os.makedirs(os.path.dirname(os.path.abspath(args.out)), exist_ok=True) + with open(args.out, "w", encoding="utf-8") as f: + f.write(output) + print(f"Saved scenario to {args.out}", file=sys.stderr) + + # Save voiceover txt for voice_acting.py + base = args.out[:-5] if args.out.endswith(".json") else args.out + video_txt_path = base + "_voiceover.txt" + video_txt = scenario_to_video_txt(result) + with open(video_txt_path, "w", encoding="utf-8") as f: + f.write(video_txt) + print(f"Saved voiceover script to {video_txt_path}", file=sys.stderr) + + print(output) + + # Run voice synthesis if requested + if args.voice: + voice_script = os.path.join( + os.path.dirname(__file__), "..", "..", "voice", "voice_acting.py" + ) + voice_script = os.path.normpath(voice_script) + if not os.path.exists(voice_script): + print(f"Error: voice_acting.py not found at {voice_script}", file=sys.stderr) + sys.exit(1) + + segments_dir = args.voice_out or os.path.join( + os.path.dirname(os.path.abspath(args.out)), "voice_segments" + ) + print(f"\nStarting voice synthesis → {segments_dir}", file=sys.stderr) + cmd = [sys.executable, voice_script, video_txt_path, "-o", segments_dir] +
result_proc = subprocess.run(cmd) + if result_proc.returncode != 0: + print("Voice synthesis failed", file=sys.stderr) + sys.exit(result_proc.returncode) + print(f"Voice segments saved to {segments_dir}", file=sys.stderr) + print(f"Manifest: {segments_dir}/segments.txt", file=sys.stderr) + + return + assets = None if args.analyze: print("Analyzing assets...", file=sys.stderr) diff --git a/story-gen/scripts/trends/aged_self.txt b/story-gen/scripts/trends/aged_self.txt new file mode 100644 index 0000000..6777ce2 --- /dev/null +++ b/story-gen/scripts/trends/aged_self.txt @@ -0,0 +1,15 @@ +keywords: aged self, old me, elderly me, я в старости, старый я, как я буду выглядеть, future me, future self, старею, состарься, age yourself, пожилой я + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle — but aged naturally. + +Clothing: Elderly elegant attire — sophisticated cardigan, classic trousers/skirt, comfortable yet dignified, silver jewelry, reading glasses. + +Location: Cozy sunlit reading nook with bookshelves, comfortable armchair, plants, warm home environment, peaceful setting. + +Pose & Action: Sitting comfortably in chair, holding a book, gentle smile, looking content and wise, relaxed posture. + +Lighting: Soft natural window light, warm afternoon glow, gentle shadows, peaceful atmosphere. + +Mood: Peaceful, wise, content, reflective, serene, life well-lived. + +Technical: 85mm f/2.0, ISO 400, portrait photography, natural aging effects, realistic wrinkles and gray hair, warm color palette. 
diff --git a/story-gen/scripts/trends/anime.txt b/story-gen/scripts/trends/anime.txt new file mode 100644 index 0000000..2002563 --- /dev/null +++ b/story-gen/scripts/trends/anime.txt @@ -0,0 +1,15 @@ +keywords: anime, аниме, manga, манга, ghibli, гибли, japanese style, японский стиль, anime portrait, аниме портрет + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Japanese school uniform style — navy blue blazer with gold buttons, white collared shirt, red ribbon tie, pleated skirt. + +Location: Cherry blossom tree lined street in spring, soft pink petals falling, urban Japanese neighborhood background. + +Pose & Action: Standing casually with hands behind back, slight smile, wind gently blowing hair and skirt. + +Lighting: Soft diffused daylight with warm golden hour glow filtering through cherry blossoms. + +Mood: Dreamy, romantic, nostalgic, youthful energy. + +Technical: Anime/manga art style, Studio Ghibli inspired, cel shading, vibrant colors, 50mm equivalent, high detail illustration. diff --git a/story-gen/scripts/trends/children_together.txt b/story-gen/scripts/trends/children_together.txt new file mode 100644 index 0000000..d49c06f --- /dev/null +++ b/story-gen/scripts/trends/children_together.txt @@ -0,0 +1,15 @@ +keywords: children together, kids together, siblings, дети вместе, детский портрет, два ребенка, siblings photo, family kids + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Child 1 — colorful striped t-shirt with denim overalls, red sneakers. Child 2 — pastel yellow dress with white cardigan, brown leather shoes. + +Location: Sunny backyard garden with green grass, colorful flowers, wooden fence, summer afternoon setting. 
+ +Pose & Action: Two children sitting on grass together, laughing, one holding a butterfly, natural interaction between them. + +Lighting: Natural warm afternoon sunlight, dappled light through tree leaves, soft shadows. + +Mood: Joyful, innocent, playful, heartwarming, childhood nostalgia. + +Technical: 50mm f/2.0, ISO 200, candid lifestyle photography, warm color grading, shallow depth of field. diff --git a/story-gen/scripts/trends/doll_in_box.txt b/story-gen/scripts/trends/doll_in_box.txt new file mode 100644 index 0000000..5c95628 --- /dev/null +++ b/story-gen/scripts/trends/doll_in_box.txt @@ -0,0 +1,15 @@ +keywords: doll, barbie, кукла, барби, doll in box, кукла в коробке, miniature, миниатюра, toy box, living doll, живая кукла + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Vintage doll-style outfit — frilly white lace dress with pink ribbon details, tiny pearl necklace, miniature leather shoes. + +Location: Inside an ornate vintage wooden jewelry box with red velvet lining, surrounded by miniature accessories. + +Pose & Action: Sitting cross-legged inside the box, hands resting on knees, looking up curiously, doll-like stillness. + +Lighting: Warm dramatic spotlight from above, creating theatrical box lighting with soft shadows around edges. + +Mood: Whimsical, mysterious, magical, fairy-tale-like, surreal. + +Technical: 100mm macro f/2.8, ISO 400, hyperrealistic miniature photography, extreme detail, tilt-shift effect. 
diff --git a/story-gen/scripts/trends/flowers_in_hair.txt b/story-gen/scripts/trends/flowers_in_hair.txt new file mode 100644 index 0000000..3775aae --- /dev/null +++ b/story-gen/scripts/trends/flowers_in_hair.txt @@ -0,0 +1,15 @@ +keywords: flowers in hair, цветы в волосах, flowers growing from hair, цветы растут из волос, floral hair, цветочные волосы, magical flowers, magical portrait, магический портрет, botanical portrait, ботанический портрет + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Ethereal flowing dress in soft neutral tones, delicate fabric that complements the flowers, minimal jewelry to not distract. + +Location: Dark mystical garden background with soft bokeh, enchanted forest atmosphere, magical ambiance. + +Pose & Action: Face completely still and motionless, head facing forward, serene expression, flowers growing from beneath hair on both sides without touching face. + +Lighting: Soft dramatic lighting highlighting face and flowers, subtle rim light for separation, magical glow on flowers. + +Mood: Magical, ethereal, enchanting, mystical, serene, otherworldly beauty. + +Technical: 85mm f/1.4, ISO 200, high-resolution portrait, animation-ready composition. FLOWERS: Multiple realistic lilies and orchids in dark shades growing from under hair on both sides, not touching face. Flowers must look extremely natural with detailed textures and natural color tones. Long stems, abundant growth spreading outward in various directions as if cascading from hair. Flowers gradually blooming and expanding behind hair, creating lush magical effect. Face remains completely still and unmoving during animation. Animation must be smooth and harmonious while maintaining image clarity. 
diff --git a/story-gen/scripts/trends/hug_inner_child.txt b/story-gen/scripts/trends/hug_inner_child.txt new file mode 100644 index 0000000..e197411 --- /dev/null +++ b/story-gen/scripts/trends/hug_inner_child.txt @@ -0,0 +1,15 @@ +keywords: inner child, hug yourself, inner child healing, обними себя, внутренний ребенок, я в детстве, yourself as child, ты в детстве, self hug, healing, исцеление, meet your younger self + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Adult you in contemporary professional attire, child version in nostalgic childhood outfit from old photos (colorful t-shirt, jeans, sneakers). + +Location: Dreamy abstract space with soft clouds, golden light beams, timeless void background, surreal meeting point. + +Pose & Action: Adult kneeling to child's level, warm embrace, both faces showing emotion, hands on shoulders, intimate hug. + +Lighting: Ethereal golden divine light from above, soft glow around both figures, heavenly atmosphere. + +Mood: Emotional, healing, self-love, nostalgic, powerful, transformative. + +Technical: 85mm f/1.4, ISO 200, conceptual portrait photography, soft dreamy bokeh, cinematic color grading. diff --git a/story-gen/scripts/trends/pet_as_human.txt b/story-gen/scripts/trends/pet_as_human.txt new file mode 100644 index 0000000..f324088 --- /dev/null +++ b/story-gen/scripts/trends/pet_as_human.txt @@ -0,0 +1,15 @@ +keywords: pet as human, animal as human, кот как человек, собака как человек, питомец как человек, anthropomorphic pet, антропоморфный питомец, humanized pet, мой кот человек, мой питомец человек + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. 
+ +Clothing: Anthropomorphic elegant outfit — tailored vest with bow tie, dress pants, pocket watch chain, sophisticated accessories matching pet's original coloring. + +Location: Cozy Victorian-style living room with fireplace, antique furniture, warm home atmosphere. + +Pose & Action: Sitting in armchair like a person, one paw/hand resting on armrest, dignified human-like posture, intelligent expression. + +Lighting: Warm fireplace glow mixed with soft ambient room lighting, candlelight accents. + +Mood: Charming, whimsical, sophisticated, humorous yet elegant. + +Technical: 85mm f/1.8, ISO 400, portrait photography style, anthropomorphic fantasy art, high detail fur/texture. diff --git a/story-gen/scripts/trends/photo_booth.txt b/story-gen/scripts/trends/photo_booth.txt new file mode 100644 index 0000000..8781418 --- /dev/null +++ b/story-gen/scripts/trends/photo_booth.txt @@ -0,0 +1,15 @@ +keywords: photo booth, фотобудка, photo strip, полоска фото, vintage photo booth, ретро фотобудка, 4 photos, четыре фото, fun photos, веселые фото, booth strip + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Fun casual outfit — colorful graphic t-shirt, denim jacket, playful accessories, props like hats and glasses. + +Location: Classic photo booth interior with red curtain backdrop, vintage booth frame visible, strip of 4 photos layout. + +Pose & Action: Four different expressions in photo strip — smiling, making funny face, serious, laughing — natural photo booth sequence. + +Lighting: Classic photo booth flash lighting, slightly harsh but nostalgic, even illumination. + +Mood: Fun, playful, nostalgic, casual, spontaneous, youthful. + +Technical: 35mm equivalent, ISO 800, vintage photo booth aesthetic, film grain, strip layout, retro color tones. 
diff --git a/story-gen/scripts/trends/soviet_era.txt b/story-gen/scripts/trends/soviet_era.txt new file mode 100644 index 0000000..23239fe --- /dev/null +++ b/story-gen/scripts/trends/soviet_era.txt @@ -0,0 +1,15 @@ +keywords: soviet, soviet era, soviet style, советский, советская эпоха, советское фото, ussr, ссср, retro russia, ретро россия, vintage russian, ностальгия советский + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Soviet-era clothing — wool coat with fur collar, traditional ushanka hat, dark boots, vintage scarf. + +Location: Red Square Moscow backdrop, historic Soviet architecture, snow-covered cobblestone streets, winter setting. + +Pose & Action: Standing formally with hands clasped, dignified posture, looking slightly off-camera, timeless pose. + +Lighting: Soft overcast winter light, muted tones, vintage film grain effect. + +Mood: Nostalgic, historical, solemn, patriotic, retro atmosphere. + +Technical: 35mm f/4.0, ISO 800, vintage film simulation, sepia/warm tint, aged postcard texture, grain overlay. diff --git a/story-gen/scripts/trends/studio_photo.txt b/story-gen/scripts/trends/studio_photo.txt new file mode 100644 index 0000000..4d9b19c --- /dev/null +++ b/story-gen/scripts/trends/studio_photo.txt @@ -0,0 +1,15 @@ +keywords: studio photo, professional photo, professional portrait, business portrait, headshot, linkedin photo, корпоративное фото, профессиональное фото, студийное фото, деловой портрет + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Elegant black tailored blazer with white silk button-up shirt, dark charcoal dress pants, minimal silver jewelry. 
+ +Location: Professional photography studio with seamless gray backdrop, professional equipment visible in soft focus background. + +Pose & Action: Standing confidently with one hand in pocket, slight lean forward, direct eye contact with camera, professional posture. + +Lighting: Three-point studio lighting setup with softbox key light, fill light, and rim light for dramatic separation. + +Mood: Professional, confident, polished, sophisticated. + +Technical: 85mm f/1.4, ISO 100, high-resolution commercial portrait photography, sharp focus on eyes, creamy bokeh background. diff --git a/story-gen/scripts/trends/three_generations.txt b/story-gen/scripts/trends/three_generations.txt new file mode 100644 index 0000000..53f8f8e --- /dev/null +++ b/story-gen/scripts/trends/three_generations.txt @@ -0,0 +1,15 @@ +keywords: three generations, три поколения, grandma, бабушка, family generations, поколения семьи, deceased relative, умерший родственник, встреча поколений, memory photo, фото памяти + +Transform this photo into a human portrait. Use the uploaded photo for the face — preserve ALL features exactly: face shape, eyes, nose, lips, eyebrows, hair color, hairstyle. + +Clothing: Three generations — Young you in modern casual wear, elderly grandmother in traditional dress with shawl, deceased loved one in their signature outfit from old photo. + +Location: Nostalgic family home interior, vintage furniture, photo albums on table, warm domestic setting. + +Pose & Action: Three people sitting together on sofa, arms around each other, genuine emotional connection, looking at old photographs together. + +Lighting: Soft warm indoor lighting, golden hour window light, intimate family atmosphere. + +Mood: Emotional, nostalgic, loving, bittersweet, family connection, remembrance. + +Technical: 50mm f/2.0, ISO 640, documentary family photography, warm color grading, soft focus background. 
From 6f5c0f04fee9eaad608ac7ad1a45ada8d1aa67bb Mon Sep 17 00:00:00 2001 From: AMEfremova Date: Fri, 3 Apr 2026 22:11:09 +0300 Subject: [PATCH 2/4] fix video mode and trend matching pipeline - video mode now delegates to generate.py instead of generating images - trend matching uses file-based trends/ with LLM fallback - generate.py system prompt updated to image-only mode (no video_clip) - trend without --photo falls back to plain image generation Co-Authored-By: Claude Sonnet 4.6 --- story-gen/preview.py | 93 ++++++++++++++++++++++------------- story-gen/scripts/generate.py | 28 ++++++----- story-gen/scripts/trend.py | 81 +++++++++++++++++++++++++++--- 3 files changed, 150 insertions(+), 52 deletions(-) diff --git a/story-gen/preview.py b/story-gen/preview.py index 8e4ee29..682a133 100644 --- a/story-gen/preview.py +++ b/story-gen/preview.py @@ -90,12 +90,14 @@ def generate_scenario(user_input: str, fmt="auto", platform="auto") -> dict: # --------------------------------------------------------------------------- -# Step 2b: trend — portrait prompt expansion +# Step 2b: trend — portrait prompt expansion (file-based + LLM fallback) # --------------------------------------------------------------------------- -TREND_PROMPT = """You are a professional prompt engineer for AI portrait image generation. +TRENDS_DIR = os.path.join(os.path.dirname(__file__), "scripts", "trends") -The user gives you a short phrase describing a popular visual trend (TikTok, Instagram, Pinterest). +TREND_FALLBACK_PROMPT = """You are a professional prompt engineer for AI portrait image generation. + +The user gives you a short phrase describing a visual style or trend. Expand it into a detailed professional portrait prompt. 
Structure: @@ -111,15 +113,47 @@ Technical: [lens, aperture, ISO, art style, quality tags] Rules: - Write entirely in English - Be very specific and detailed -- Base on what is actually trending on TikTok/Instagram/Pinterest - Return ONLY the prompt text, no explanations """ +def _load_trends() -> list[dict]: + trends = [] + if not os.path.isdir(TRENDS_DIR): + return trends + for fname in os.listdir(TRENDS_DIR): + if not fname.endswith(".txt"): + continue + with open(os.path.join(TRENDS_DIR, fname), encoding="utf-8") as f: + content = f.read().strip() + lines = content.splitlines() + keywords = [] + prompt_lines = [] + for i, line in enumerate(lines): + if line.startswith("keywords:"): + keywords = [k.strip().lower() for k in line[len("keywords:"):].split(",") if k.strip()] + else: + prompt_lines = lines[i:] + break + trends.append({ + "name": fname.replace(".txt", ""), + "keywords": keywords, + "prompt": "\n".join(prompt_lines).strip() + }) + return trends + + def expand_trend(user_input: str) -> str: + query = user_input.lower() + for trend in _load_trends(): + if any(kw in query for kw in trend["keywords"]): + print(f" Matched trend: {trend['name']}", file=sys.stderr) + return trend["prompt"] + + print(" No trend matched — using LLM to generate prompt", file=sys.stderr) payload = json.dumps({ "model": MODEL, "messages": [ - {"role": "system", "content": TREND_PROMPT}, + {"role": "system", "content": TREND_FALLBACK_PROMPT}, {"role": "user", "content": user_input} ], "temperature": 0.7 @@ -295,43 +329,36 @@ def main(): request_type, reason = classify(args.input) print(f" Type: {request_type} — {reason}", file=sys.stderr) - # --- VIDEO --- + # --- VIDEO → scenario only, no image generation --- if request_type == "video": - print(" Building scenario...", file=sys.stderr) - scenario = generate_scenario(args.input, args.format, args.platform) + print(" Building video scenario...", file=sys.stderr) + scripts_dir = os.path.join(os.path.dirname(__file__), "scripts")
+ sys.path.insert(0, scripts_dir) + import generate as gen + scenario = gen.generate_video(args.input, args.platform, audience="", lang="auto") print(f" Title: {scenario['title']}", file=sys.stderr) print(f" Scenes: {len(scenario['scenes'])}", file=sys.stderr) - - if args.scene is not None: - scenes = scenario["scenes"] - idx = args.scene - 1 - if idx < 0 or idx >= len(scenes): - print(f"Error: scene {args.scene} not found (total: {len(scenes)})", file=sys.stderr) - sys.exit(1) - prompt = scenes[idx]["visual_prompt"] - label = f"scene{args.scene}_{scenario['title']}" - print(f" Rendering scene {args.scene}", file=sys.stderr) - else: - grid_prompt = scenario.get("storyboard_grid_prompt") - prompt = grid_prompt if grid_prompt else scenario["scenes"][0]["visual_prompt"] - label = f"grid_{scenario['title']}" - print(f" Rendering storyboard grid", file=sys.stderr) - - image_bytes = generate_image(prompt, args.size) - save_and_open(image_bytes, label) - print("\nFull scenario:") + print(f" Duration: {scenario.get('duration_sec')}s", file=sys.stderr) print(json.dumps(scenario, ensure_ascii=False, indent=2)) # --- TREND --- elif request_type == "trend": - if args.photo and not os.path.exists(args.photo): + if not args.photo: + # No photo — trend transform is impossible, fall back to plain image generation + print(" Trend detected but no --photo provided — generating image directly", file=sys.stderr) + prompt = expand_image_prompt(args.input) + print(f"\nPrompt:\n{prompt}\n", file=sys.stderr) + image_bytes = generate_image(prompt, args.size) + save_and_open(image_bytes, f"image_{args.input[:30]}") + elif not os.path.exists(args.photo): print(f"Error: photo not found: {args.photo}", file=sys.stderr) sys.exit(1) - print(" Expanding trend prompt...", file=sys.stderr) - prompt = expand_trend(args.input) - print(f"\nPrompt:\n{prompt}\n", file=sys.stderr) - image_bytes = generate_image(prompt, args.size, photo_path=args.photo) - save_and_open(image_bytes, 
f"trend_{args.input[:30]}") + else: + print(" Expanding trend prompt...", file=sys.stderr) + prompt = expand_trend(args.input) + print(f"\nPrompt:\n{prompt}\n", file=sys.stderr) + image_bytes = generate_image(prompt, args.size, photo_path=args.photo) + save_and_open(image_bytes, f"trend_{args.input[:30]}") # --- IMAGE --- else: diff --git a/story-gen/scripts/generate.py b/story-gen/scripts/generate.py index 0a1937c..a8a2d6b 100755 --- a/story-gen/scripts/generate.py +++ b/story-gen/scripts/generate.py @@ -16,18 +16,24 @@ API_URL = os.environ.get("OPENAI_BASE_URL", "https://llm.lambda.coredump.ru/v1") API_KEY = os.environ.get("OPENAI_API_KEY", "") MODEL = os.environ.get("STORY_MODEL", "qwen3.5-122b") -SYSTEM_PROMPT = """You are a professional scriptwriter for viral and advertising video content. -You know the patterns of TikTok, Instagram Reels, YouTube Shorts, and Wildberries ads. -You understand how to adapt content for different platforms and audiences. +SYSTEM_PROMPT = """You are a professional storyboard creator for image-based video production. + +This is IMAGE MODE. Every scene must be a STATIC IMAGE generated by an AI image model (gpt-image-1.5). +Do NOT describe motion, camera movement, or video clips. Describe a single frozen frame per scene. You receive a product description, idea, or topic in ANY language. You return ONLY valid JSON with no markdown or comments. 
Rules for visual_prompt: - ALWAYS in English regardless of input or output language -- Maximum detail: style, lighting, camera angle, colors, atmosphere -- Optimized for gpt-image-1.5 and veo-3.1 generation -- Example: "Close-up of beige leather handbag on marble surface, soft natural window light, luxury minimal style, 4k sharp" +- Describe ONE static image: subject, composition, lighting, colors, mood, art style +- No motion words: no "pan", "zoom", "slow", "moving", "transition" +- Optimized for gpt-image-1.5 image generation +- Example: "Woman holding beige leather handbag, marble studio background, soft natural window light, luxury minimal style, 4k sharp" + +Rules for visual_type: +- ALWAYS set to "image" — this is image mode, never "video_clip" +- Use "text_only" only for pure text/title cards with no subject Rules for voiceover and caption: - Match the language specified in the lang parameter @@ -38,8 +44,8 @@ Rules for storyboard_grid_prompt: - Describes ALL scenes as a single image grid for consistent character/subject generation - Grid size: 2x2 for 2-4 scenes, 3x3 for 5-9 scenes, 4x4 for 10-16 scenes - Format: "NxN storyboard grid, same [subject] throughout: [top-left] scene1 description [top-right] scene2 description [bottom-left] scene3 description [bottom-right] scene4 description. Consistent [style notes], photorealistic, 8k" -- Purpose: feed directly into gpt-image-1 or nanobanana to generate all frames at once with consistent appearance -- If the video has no recurring character or subject (e.g. 
abstract/text-only), set to null +- Purpose: feed directly into gpt-image-1 to generate all frames at once with consistent appearance +- If no recurring character or subject, set to null Response format: { @@ -55,13 +61,13 @@ Response format: { "id": 1, "duration_sec": 5, - "visual_prompt": "detailed English prompt for gpt-image-1.5 or veo-3.1", - "visual_type": "image|video_clip|text_only", + "visual_prompt": "static image prompt in English for gpt-image-1.5", + "visual_type": "image", "voiceover": "narration text in target language", "caption": "on-screen text in target language" } ], - "storyboard_grid_prompt": "NxN storyboard grid prompt for consistent frame generation, or null", + "storyboard_grid_prompt": "NxN storyboard grid prompt or null", "music_mood": "upbeat|calm|dramatic|funny|inspirational", "style_notes": "overall style and delivery notes", "asset_analysis": null diff --git a/story-gen/scripts/trend.py b/story-gen/scripts/trend.py index bdc0e3d..50e0ccd 100644 --- a/story-gen/scripts/trend.py +++ b/story-gen/scripts/trend.py @@ -2,9 +2,9 @@ """ trend.py — portrait trend prompt enhancer. -User writes a short phrase like "аниме" or "я в средневековье". -LLM expands it into a professional portrait prompt based on current internet trends. -Optionally generates the image immediately. +Checks if the user's request matches one of the known trends (stored in trends/ folder). +If matched — returns the curated prompt directly. +If not matched — asks LLM to generate a prompt based on the request. 
Usage: python story-gen/scripts/trend.py "аниме" @@ -23,9 +23,60 @@ API_KEY = os.environ.get("OPENAI_API_KEY", "") MODEL = os.environ.get("STORY_MODEL", "qwen3.5-122b") IMAGE_MODEL = os.environ.get("IMAGE_MODEL", "gpt-image-1") +TRENDS_DIR = os.path.join(os.path.dirname(__file__), "trends") + +# --------------------------------------------------------------------------- +# Trend matching +# --------------------------------------------------------------------------- + +def load_trends() -> list[dict]: + """Load all trend files. Returns list of {name, keywords, prompt}.""" + trends = [] + if not os.path.isdir(TRENDS_DIR): + return trends + for fname in os.listdir(TRENDS_DIR): + if not fname.endswith(".txt"): + continue + path = os.path.join(TRENDS_DIR, fname) + with open(path, encoding="utf-8") as f: + content = f.read().strip() + # First line: "keywords: word1, word2, ..." + lines = content.splitlines() + keywords = [] + prompt_lines = [] + for i, line in enumerate(lines): + if line.startswith("keywords:"): + raw = line[len("keywords:"):].strip() + keywords = [k.strip().lower() for k in raw.split(",") if k.strip()] + else: + prompt_lines = lines[i:] + break + prompt = "\n".join(prompt_lines).strip() + trends.append({ + "name": fname.replace(".txt", ""), + "keywords": keywords, + "prompt": prompt + }) + return trends + + +def match_trend(user_input: str, trends: list[dict]) -> dict | None: + """Return matched trend dict or None if no match.""" + query = user_input.lower() + for trend in trends: + for kw in trend["keywords"]: + if kw in query: + return trend + return None + + +# --------------------------------------------------------------------------- +# LLM fallback for unknown trends +# --------------------------------------------------------------------------- + SYSTEM_PROMPT = """You are a professional prompt engineer for AI portrait image generation. -The user gives you a short casual phrase describing a popular visual trend (TikTok, Instagram, Pinterest). 
+The user gives you a short casual phrase describing a visual style or trend. Your job: expand it into a detailed, professional portrait prompt. Always structure the output as follows: @@ -47,13 +98,12 @@ Technical: [camera lens, aperture, ISO, art style, quality tags] Rules: - Write entirely in English - Be very specific and detailed in every section -- Base your output on what is actually trending on TikTok/Instagram/Pinterest right now - The prompt must be ready to paste directly into an image generation model - Return ONLY the prompt text, no explanations, no markdown """ -def expand_trend(user_input: str) -> str: +def expand_trend_llm(user_input: str) -> str: payload = json.dumps({ "model": MODEL, "messages": [ @@ -72,6 +122,21 @@ def expand_trend(user_input: str) -> str: return data["choices"][0]["message"]["content"].strip() +def expand_trend(user_input: str) -> tuple[str, str]: + """Returns (prompt, source) where source is trend name or 'llm'.""" + trends = load_trends() + matched = match_trend(user_input, trends) + if matched: + print(f" Matched trend: {matched['name']}", file=sys.stderr) + return matched["prompt"], matched["name"] + print(" No trend matched — using LLM to generate prompt", file=sys.stderr) + return expand_trend_llm(user_input), "llm" + + +# --------------------------------------------------------------------------- +# Image generation +# --------------------------------------------------------------------------- + def generate_image(prompt: str, size: str) -> bytes: print(" Generating image...", file=sys.stderr) payload = json.dumps({ @@ -115,8 +180,8 @@ def main(): print("Error: OPENAI_API_KEY not set", file=sys.stderr) sys.exit(1) - print(f"Expanding trend: {args.input!r}", file=sys.stderr) - prompt = expand_trend(args.input) + print(f"Processing trend request: {args.input!r}", file=sys.stderr) + prompt, _ = expand_trend(args.input) print(prompt) From b4cf13e916f8c2a82216b263920f31665dc7158e Mon Sep 17 00:00:00 2001 From: AMEfremova 
Date: Fri, 3 Apr 2026 22:25:43 +0300 Subject: [PATCH 3/4] add WB product commercial pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - new wb_product classifier type in preview.py — triggers on WB URLs, product descriptions, marketplace content - new WB_PRODUCT_PROMPT in generate.py: serious commercial style, structured scenes (hook → features → lifestyle → CTA) - generate_wb_product() function with asset support: photos, videos, text, WB URL - --assets CLI flag for passing multiple asset files/URLs/text to the pipeline - asset_plan output: lists what shots/clips are needed if assets not provided Co-Authored-By: Claude Sonnet 4.6 --- story-gen/preview.py | 54 ++++++++++++++-- story-gen/scripts/generate.py | 112 ++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 5 deletions(-) diff --git a/story-gen/preview.py b/story-gen/preview.py index 682a133..298312b 100644 --- a/story-gen/preview.py +++ b/story-gen/preview.py @@ -30,15 +30,16 @@ IMAGE_MODEL = os.environ.get("IMAGE_MODEL", "gpt-image-1") # Step 1: classify the request # --------------------------------------------------------------------------- -CLASSIFY_PROMPT = """You are a media production assistant. Classify the user request into one of three types: +CLASSIFY_PROMPT = """You are a media production assistant. Classify the user request into one of four types: -- "video": user wants a video — ad, reel, TikTok, scenario, product showcase, story +- "wb_product": user wants to create a commercial video ad for a Wildberries product — they provide a WB URL, product name, description, photos, or any combination of product assets. Key signals: mentions WB/Wildberries, product URL (wildberries.ru), product specs, wants to sell something on a marketplace. 
+- "video": user wants a generic video — reel, TikTok, story, scenario — NOT specifically a WB product ad - "trend": user wants a portrait transformation based on a popular visual trend (anime, medieval, doll, soviet era, studio photo, aged self, flowers in hair, etc.) -- "image": user wants a single image of anything that is NOT a portrait trend and NOT a video +- "image": user wants a single image of anything that is NOT a portrait trend, NOT a video, NOT a WB product Return ONLY valid JSON, no markdown: { - "type": "video|trend|image", + "type": "wb_product|video|trend|image", "reason": "one sentence explanation" } """ @@ -318,6 +319,8 @@ def main(): choices=["1024x1024", "1792x1024", "1024x1792"]) parser.add_argument("--photo", default=None, help="Path to your photo (for portrait trends — face will be preserved)") + parser.add_argument("--assets", nargs="*", default=None, + help="Additional assets for WB product: photo paths, video paths, WB URL, text strings") args = parser.parse_args() if not API_KEY: @@ -329,8 +332,49 @@ def main(): request_type, reason = classify(args.input) print(f" Type: {request_type} — {reason}", file=sys.stderr) + # --- WB PRODUCT → professional product commercial scenario --- + if request_type == "wb_product": + print(" Building WB product commercial scenario...", file=sys.stderr) + scripts_dir = os.path.join(os.path.dirname(__file__), "scripts") + sys.path.insert(0, scripts_dir) + import generate as gen + + # Build assets dict from CLI args + assets = {} + if args.photo: + if os.path.exists(args.photo): + assets["photo_1"] = {"type": "photo", "path": args.photo} + else: + assets["photo_1"] = {"type": "photo", "url": args.photo} + if args.assets: + for i, a in enumerate(args.assets, start=(2 if args.photo else 1)): + ext = os.path.splitext(a)[1].lower() + if ext in (".mp4", ".mov", ".avi", ".webm"): + assets[f"video_{i}"] = {"type": "video", "path": a} + elif ext in (".jpg", ".jpeg", ".png", ".webp", ".jfif"): + assets[f"photo_{i}"] = 
{"type": "photo", "path": a} + elif a.startswith("http"): + if "wildberries.ru" in a or "wb.ru" in a: + assets["wb_url"] = {"type": "url", "url": a} + else: + assets[f"url_{i}"] = {"type": "url", "url": a} + else: + assets[f"text_{i}"] = {"type": "text", "content": a} + + scenario = gen.generate_wb_product( + args.input, + platform=args.platform if args.platform != "auto" else "wb", + lang="ru", + duration=None, + assets=assets if assets else None + ) + print(f" Title: {scenario['title']}", file=sys.stderr) + print(f" Scenes: {len(scenario['scenes'])}", file=sys.stderr) + print(f" Duration: {scenario.get('duration_sec')}s", file=sys.stderr) + print(json.dumps(scenario, ensure_ascii=False, indent=2)) + # --- VIDEO → только сценарий, без генерации картинок --- - if request_type == "video": + elif request_type == "video": print(" Building video scenario...", file=sys.stderr) scripts_dir = os.path.join(os.path.dirname(__file__), "scripts") sys.path.insert(0, scripts_dir) diff --git a/story-gen/scripts/generate.py b/story-gen/scripts/generate.py index a8a2d6b..2745cbc 100755 --- a/story-gen/scripts/generate.py +++ b/story-gen/scripts/generate.py @@ -110,6 +110,118 @@ Response format: }""" +WB_PRODUCT_PROMPT = """You are a senior commercial director specializing in e-commerce video ads for Wildberries. + +You receive product information in ANY form: URL, text description, photo list, video clips description, or any mix. +You return ONLY valid JSON with no markdown or comments. + +STYLE: Professional, serious, commercial. No fluff. Every second sells. +FORMAT: Optimized for Wildberries product card video (16:9, up to 30 sec) or Reels/TikTok (9:16). 
+ +Asset types the user may provide: +- photo: product photo URLs or file paths +- video: existing video clip descriptions or paths +- text: product name, description, specs, USPs +- url: Wildberries product URL + +Rules for scenes: +- Each scene must serve a sales purpose: hook, feature reveal, social proof, CTA +- visual_prompt: ALWAYS in English, describe the exact frame (product placement, lighting, background, text overlay if needed) +- visual_type: "image" (from photo asset), "video_clip" (from video asset), "text_only" (title card / CTA) +- asset_ref: which input asset to use for this scene (e.g. "photo_1", "video_1", null for generated) +- voiceover: confident, benefit-driven, in Russian (unless lang specified) +- caption: short punchy on-screen text, matches voiceover beat + +Scene structure (follow this order): +1. HOOK (2-3 sec): product hero shot, strong opening line — stop the scroll +2. PROBLEM/NEED (3-5 sec): what problem this solves or what desire it fulfills +3. FEATURE 1-3 (3-5 sec each): key product features with close-up visuals +4. LIFESTYLE/RESULT (3-5 sec): product in use, aspirational context +5. 
CTA (2-3 sec): price, offer, "Купить на Wildberries" / link + +Rules for storyboard_grid_prompt: +- NxN grid of all scenes for consistent look across frames +- Consistent product appearance, lighting style, brand colors throughout +- Set to null if assets are provided (no need to generate) + +Response format: +{ + "title": "product video title", + "format": "wb_ad", + "platform": "wb|instagram|tiktok", + "language": "ru", + "duration_sec": 25, + "hook": "exact opening line — first 3 seconds", + "target_audience": "who buys this product", + "content_restrictions": "WB aspect ratio, no misleading claims", + "scenes": [ + { + "id": 1, + "duration_sec": 3, + "scene_purpose": "hook|problem|feature|lifestyle|cta", + "visual_prompt": "English prompt for image/video generation", + "visual_type": "image|video_clip|text_only", + "asset_ref": "photo_1|video_1|null", + "voiceover": "spoken text in Russian", + "caption": "on-screen text" + } + ], + "storyboard_grid_prompt": "NxN grid prompt or null", + "music_mood": "upbeat|calm|dramatic|inspirational", + "style_notes": "commercial, clean, product-focused", + "asset_plan": { + "photos_needed": ["description of shots needed if no photos provided"], + "videos_needed": ["description of video clips needed"], + "text_overlays": ["list of text cards / CTAs"], + "wb_url": "extracted or provided WB product URL" + } +}""" + + +def generate_wb_product(input_text: str, platform: str = "wb", lang: str = "ru", + duration: int = None, assets: dict = None) -> dict: + """Generate a professional WB product commercial scenario.""" + if not API_KEY: + print("Error: OPENAI_API_KEY not set", file=sys.stderr) + sys.exit(1) + + parts = [f"Product input: {input_text}"] + if platform != "auto": parts.append(f"Platform: {platform}") + if lang != "auto": parts.append(f"Output language: {lang}") + if duration: parts.append(f"Target duration: {duration} seconds") + if assets: parts.append(f"Available assets: {json.dumps(assets, ensure_ascii=False)}") + + 
payload = json.dumps({ + "model": MODEL, + "messages": [ + {"role": "system", "content": WB_PRODUCT_PROMPT}, + {"role": "user", "content": "\n".join(parts)} + ], + "temperature": 0.6 + }).encode() + + req = request.Request( + f"{API_URL}/chat/completions", + data=payload, + headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"} + ) + + try: + with request.urlopen(req, timeout=90) as resp: + data = json.loads(resp.read()) + except error.URLError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + content = data["choices"][0]["message"]["content"].strip() + if content.startswith("```"): + content = content.split("```")[1] + if content.startswith("json"): + content = content[4:] + + return json.loads(content.strip()) + + ASSET_PROMPT = """You analyze input data for video production. Your task is to extract maximum information from what is given. From 41219baa7ceb94d38462dcfc4362d2a545ceb871 Mon Sep 17 00:00:00 2001 From: AMEfremova Date: Fri, 3 Apr 2026 22:51:21 +0300 Subject: [PATCH 4/4] WB product pipeline: card + video modes, default 1 card, fix image size --- story-gen/preview.py | 69 +++++++++-- story-gen/scripts/generate.py | 224 +++++++++++++++++++++++++++------- 2 files changed, 239 insertions(+), 54 deletions(-) diff --git a/story-gen/preview.py b/story-gen/preview.py index 298312b..6c28778 100644 --- a/story-gen/preview.py +++ b/story-gen/preview.py @@ -321,6 +321,11 @@ def main(): help="Path to your photo (for portrait trends — face will be preserved)") parser.add_argument("--assets", nargs="*", default=None, help="Additional assets for WB product: photo paths, video paths, WB URL, text strings") + parser.add_argument("--mode", default="auto", + choices=["auto", "card", "video"], + help="WB product mode: card=generate infographic images, video=generate video scenario") + parser.add_argument("--cards", type=int, default=1, + help="Number of WB cards to generate (default: 1)") args = parser.parse_args() if not API_KEY: 
@@ -332,9 +337,8 @@ def main(): request_type, reason = classify(args.input) print(f" Type: {request_type} — {reason}", file=sys.stderr) - # --- WB PRODUCT → professional product commercial scenario --- + # --- WB PRODUCT → card infographics or video scenario --- if request_type == "wb_product": - print(" Building WB product commercial scenario...", file=sys.stderr) scripts_dir = os.path.join(os.path.dirname(__file__), "scripts") sys.path.insert(0, scripts_dir) import generate as gen @@ -360,18 +364,57 @@ def main(): assets[f"url_{i}"] = {"type": "url", "url": a} else: assets[f"text_{i}"] = {"type": "text", "content": a} + assets = assets if assets else None - scenario = gen.generate_wb_product( - args.input, - platform=args.platform if args.platform != "auto" else "wb", - lang="ru", - duration=None, - assets=assets if assets else None - ) - print(f" Title: {scenario['title']}", file=sys.stderr) - print(f" Scenes: {len(scenario['scenes'])}", file=sys.stderr) - print(f" Duration: {scenario.get('duration_sec')}s", file=sys.stderr) - print(json.dumps(scenario, ensure_ascii=False, indent=2)) + # Determine mode: card or video + # Auto-detect: if user mentions "карточка", "инфографика", "фото" → card; "видео", "ролик" → video + wb_mode = args.mode + if wb_mode == "auto": + inp_lower = args.input.lower() + if any(w in inp_lower for w in ["карточк", "инфографик", "фото", "картинк", "изображен", "card"]): + wb_mode = "card" + elif any(w in inp_lower for w in ["видео", "ролик", "video", "reels", "тикток"]): + wb_mode = "video" + else: + wb_mode = "card" # default for WB product is card + + if wb_mode == "card": + print(" Building WB product card series...", file=sys.stderr) + plan = gen.generate_wb_product_cards(args.input, assets, count=args.cards) + product_name = plan.get("product_name", "product") + cards = plan.get("cards", []) + print(f" Product: {product_name}", file=sys.stderr) + print(f" Cards: {len(cards)}", file=sys.stderr) + print(f" Color scheme: 
{plan.get('color_scheme', '')}", file=sys.stderr) + print(json.dumps(plan, ensure_ascii=False, indent=2)) + + # Generate images for each card + print(f"\n Generating {len(cards)} card images...", file=sys.stderr) + for card in cards: + cid = card["id"] + prompt = card["image_prompt"] + headline = card.get("headline_ru", "") + print(f" Card {cid}: {card['card_type']} — {headline[:40]}", file=sys.stderr) + try: + image_bytes = generate_image(prompt, "1024x1024") # square, closest supported + label = f"wb_card_{cid}_{card['card_type']}_{product_name[:20]}" + save_and_open(image_bytes, label) + except Exception as e: + print(f" Card {cid} generation failed: {e}", file=sys.stderr) + + else: + print(" Building WB product video scenario...", file=sys.stderr) + scenario = gen.generate_wb_product_video( + args.input, + platform=args.platform if args.platform != "auto" else "wb", + lang="ru", + duration=None, + assets=assets + ) + print(f" Title: {scenario['title']}", file=sys.stderr) + print(f" Scenes: {len(scenario['scenes'])}", file=sys.stderr) + print(f" Duration: {scenario.get('duration_sec')}s", file=sys.stderr) + print(json.dumps(scenario, ensure_ascii=False, indent=2)) # --- VIDEO → только сценарий, без генерации картинок --- elif request_type == "video": diff --git a/story-gen/scripts/generate.py b/story-gen/scripts/generate.py index 2745cbc..e868973 100755 --- a/story-gen/scripts/generate.py +++ b/story-gen/scripts/generate.py @@ -110,74 +110,216 @@ Response format: }""" -WB_PRODUCT_PROMPT = """You are a senior commercial director specializing in e-commerce video ads for Wildberries. +WB_PRODUCT_VIDEO_PROMPT = """You are a senior commercial director specializing in e-commerce video ads for Wildberries. -You receive product information in ANY form: URL, text description, photo list, video clips description, or any mix. +You receive product information in ANY form: URL, text description, asset list, or any mix. You return ONLY valid JSON with no markdown or comments. 
-STYLE: Professional, serious, commercial. No fluff. Every second sells. -FORMAT: Optimized for Wildberries product card video (16:9, up to 30 sec) or Reels/TikTok (9:16). +STYLE: Professional, serious, high-energy commercial. Every second sells. +FORMAT: Wildberries product card video (9:16 vertical, 15-30 sec) or Reels/TikTok. Asset types the user may provide: -- photo: product photo URLs or file paths -- video: existing video clip descriptions or paths +- photo: product photo paths/URLs +- video: existing clip paths/URLs - text: product name, description, specs, USPs - url: Wildberries product URL +VISUAL STYLE for all scenes (mandatory): +- Background: deep dark (near-black, dark charcoal, very dark navy or deep gradient) OR studio white — always solid, never busy +- Product: ultra-bright, hyper-saturated, sharp, glowing with light — MUST pop against background +- Text overlays: bold modern sans-serif, large, high contrast (white or bright accent color on dark, or dark on white) +- Lighting: dramatic product lighting — rim light, spotlight, glossy highlights +- Feel: premium e-commerce, clean, aspirational — like Apple or luxury brand ad + Rules for scenes: -- Each scene must serve a sales purpose: hook, feature reveal, social proof, CTA -- visual_prompt: ALWAYS in English, describe the exact frame (product placement, lighting, background, text overlay if needed) -- visual_type: "image" (from photo asset), "video_clip" (from video asset), "text_only" (title card / CTA) -- asset_ref: which input asset to use for this scene (e.g. "photo_1", "video_1", null for generated) -- voiceover: confident, benefit-driven, in Russian (unless lang specified) -- caption: short punchy on-screen text, matches voiceover beat +- visual_prompt: ALWAYS in English. 
Include: background color, product description and placement, lighting style, text overlay content and style, overall mood +- visual_type: "image" (generated or from photo), "video_clip" (motion), "text_only" (title/CTA card) +- asset_ref: "photo_1", "video_1", null (to generate) +- voiceover: confident, punchy Russian — short sentences, benefit-driven +- caption: exact Russian text to show on screen, large and bold -Scene structure (follow this order): -1. HOOK (2-3 sec): product hero shot, strong opening line — stop the scroll -2. PROBLEM/NEED (3-5 sec): what problem this solves or what desire it fulfills -3. FEATURE 1-3 (3-5 sec each): key product features with close-up visuals -4. LIFESTYLE/RESULT (3-5 sec): product in use, aspirational context -5. CTA (2-3 sec): price, offer, "Купить на Wildberries" / link - -Rules for storyboard_grid_prompt: -- NxN grid of all scenes for consistent look across frames -- Consistent product appearance, lighting style, brand colors throughout -- Set to null if assets are provided (no need to generate) +Scene structure: +1. HERO (3 sec): full product shot, ultra-bright, dramatic dark background, product name in bold +2. PROBLEM (3-4 sec): relatable pain point, minimal visual, strong text +3. FEATURE 1 (3-4 sec): close-up of key feature, spec text overlay +4. FEATURE 2 (3-4 sec): second key feature or benefit +5. LIFESTYLE (3-4 sec): product in aspirational real-world use +6. 
CTA (2-3 sec): dark bg, product + price text + "Wildberries" branding Response format: { - "title": "product video title", + "title": "video title", "format": "wb_ad", - "platform": "wb|instagram|tiktok", + "platform": "wb", "language": "ru", - "duration_sec": 25, - "hook": "exact opening line — first 3 seconds", - "target_audience": "who buys this product", - "content_restrictions": "WB aspect ratio, no misleading claims", + "duration_sec": 22, + "hook": "opening hook line", + "target_audience": "...", "scenes": [ { "id": 1, "duration_sec": 3, - "scene_purpose": "hook|problem|feature|lifestyle|cta", - "visual_prompt": "English prompt for image/video generation", + "scene_purpose": "hero|problem|feature|lifestyle|cta", + "visual_prompt": "English image generation prompt with full visual details", "visual_type": "image|video_clip|text_only", - "asset_ref": "photo_1|video_1|null", - "voiceover": "spoken text in Russian", - "caption": "on-screen text" + "asset_ref": "photo_1|null", + "voiceover": "Russian spoken text", + "caption": "Russian on-screen text — bold, short" } ], - "storyboard_grid_prompt": "NxN grid prompt or null", - "music_mood": "upbeat|calm|dramatic|inspirational", - "style_notes": "commercial, clean, product-focused", + "storyboard_grid_prompt": "NxN storyboard grid prompt or null", + "music_mood": "upbeat|dramatic|inspirational", + "style_notes": "dark bg, hyper-bright product, bold typography", "asset_plan": { - "photos_needed": ["description of shots needed if no photos provided"], - "videos_needed": ["description of video clips needed"], - "text_overlays": ["list of text cards / CTAs"], - "wb_url": "extracted or provided WB product URL" + "photos_needed": ["list of product shots needed if no assets provided"], + "videos_needed": ["list of video clips needed"], + "wb_url": "extracted WB URL or null" } }""" +WB_CARD_PROMPT = """You are a professional e-commerce graphic designer specializing in Wildberries product card infographics. 
+ +You receive product information and generate a plan for a series of product card images (infographics). +You return ONLY valid JSON with no markdown or comments. + +WILDBERRIES CARD VISUAL STYLE (strict): +- Format: vertical 3:4 (900x1200px equivalent) +- Background: very dark (near-black #0a0a0a, dark charcoal #1a1a1a, deep navy #0d1b2a, or rich dark gradient) — creates maximum contrast +- Product: ultra-bright, hyper-saturated, sharp studio shot — GLOWING against dark background, as if lit from within +- Typography: bold modern sans-serif (like Montserrat Bold, Bebas Neue). Large, commanding. White or bright accent (electric blue, neon green, vivid orange, hot pink) on dark background +- Text hierarchy: HUGE headline (product name / main benefit), medium subtitle, small spec details +- Layout: product takes 50-70% of frame, text occupies remaining space in clear zones +- Infographic elements: icons, dividers, spec badges, numbered lists — all clean and minimal +- Feel: premium, powerful, high-converting — like top Wildberries sellers' cards + +Card count: generate exactly as many cards as requested. Default is 1 (HERO card only). +If count=1: generate only the HERO card. +If count>1: generate that many cards following this sequence: +1. HERO card: full product shot center, product name huge at top, main tagline at bottom. Most dramatic lighting. +2. FEATURES card: product + 3-4 key specs as icon+text blocks around it +3. BENEFIT 1 card: close-up detail, ONE big benefit headline, short explanation +4. BENEFIT 2 card: different angle or use case, another key benefit +5. COMPARISON/PROOF card: "почему мы" — size chart, material callout, or social proof numbers +6. CTA card: product + price area (leave price blank if unknown) + "Купить на Wildberries" + +For each card generate: +- image_prompt: ENGLISH, full detailed prompt for gpt-image-1 image generation + Must include: exact background color (e.g. 
"very dark charcoal background #1a1a2e"), product description and position, + lighting style (studio rim light, spotlight from above, etc.), text overlay description (content, size, placement, color), + visual style tags +- headline_ru: main Russian text for this card (large, bold) +- subtext_ru: secondary Russian text (smaller) +- design_notes: specific layout or design instructions in Russian + +Response format: +{ + "product_name": "extracted product name", + "product_category": "category", + "color_scheme": "describe the dark bg color and accent color chosen", + "typography_style": "font style description", + "cards": [ + { + "id": 1, + "card_type": "hero|features|benefit|comparison|cta", + "image_prompt": "full English prompt for AI image generation", + "headline_ru": "ГЛАВНЫЙ ЗАГОЛОВОК", + "subtext_ru": "пояснительный текст", + "design_notes": "инструкции по верстке" + } + ], + "brand_notes": "overall visual brand guidance for consistency" +}""" + + +def generate_wb_product_video(input_text: str, platform: str = "wb", lang: str = "ru", + duration: int = None, assets: dict = None) -> dict: + """Generate a professional WB product commercial video scenario.""" + if not API_KEY: + print("Error: OPENAI_API_KEY not set", file=sys.stderr) + sys.exit(1) + + parts = [f"Product input: {input_text}"] + if platform != "auto": parts.append(f"Platform: {platform}") + if lang != "auto": parts.append(f"Output language: {lang}") + if duration: parts.append(f"Target duration: {duration} seconds") + if assets: parts.append(f"Available assets: {json.dumps(assets, ensure_ascii=False)}") + + payload = json.dumps({ + "model": MODEL, + "messages": [ + {"role": "system", "content": WB_PRODUCT_VIDEO_PROMPT}, + {"role": "user", "content": "\n".join(parts)} + ], + "temperature": 0.6 + }).encode() + + req = request.Request( + f"{API_URL}/chat/completions", + data=payload, + headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"} + ) + try: + with 
request.urlopen(req, timeout=90) as resp: + data = json.loads(resp.read()) + except error.URLError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + content = data["choices"][0]["message"]["content"].strip() + if content.startswith("```"): + content = content.split("```")[1] + if content.startswith("json"): + content = content[4:] + return json.loads(content.strip()) + + +def generate_wb_product_cards(input_text: str, assets: dict = None, count: int = 1) -> dict: + """Generate a WB product card infographic series plan.""" + if not API_KEY: + print("Error: OPENAI_API_KEY not set", file=sys.stderr) + sys.exit(1) + + parts = [f"Product input: {input_text}"] + parts.append(f"Number of cards to generate: {count}") + if assets: + parts.append(f"Available assets: {json.dumps(assets, ensure_ascii=False)}") + + payload = json.dumps({ + "model": MODEL, + "messages": [ + {"role": "system", "content": WB_CARD_PROMPT}, + {"role": "user", "content": "\n".join(parts)} + ], + "temperature": 0.5 + }).encode() + + req = request.Request( + f"{API_URL}/chat/completions", + data=payload, + headers={"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"} + ) + try: + with request.urlopen(req, timeout=90) as resp: + data = json.loads(resp.read()) + except error.URLError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + content = data["choices"][0]["message"]["content"].strip() + if content.startswith("```"): + content = content.split("```")[1] + if content.startswith("json"): + content = content[4:] + return json.loads(content.strip()) + + +# Keep old name as alias for backwards compatibility +def generate_wb_product(input_text: str, platform: str = "wb", lang: str = "ru", + duration: int = None, assets: dict = None) -> dict: + return generate_wb_product_video(input_text, platform, lang, duration, assets) + + def generate_wb_product(input_text: str, platform: str = "wb", lang: str = "ru", duration: int = None, assets: dict = None) -> dict: 
"""Generate a professional WB product commercial scenario."""