feat(whatsapp): native media sending — images, videos, documents

Add a /send-media endpoint to the WhatsApp bridge and corresponding
adapter methods so the agent can send files as native WhatsApp
attachments instead of plain-text URLs/paths.

- bridge.js: new POST /send-media endpoint using Baileys' native
  image/video/document/audio message types with MIME detection
- base.py: add send_video(), send_document(), send_image_file()
  with text fallbacks; route MEDIA: tags by file extension instead
  of always treating them as voice messages
- whatsapp.py: implement all media methods via a shared
  _send_media_to_bridge() helper; override send_image() to download
  URLs to local cache and send as native photos
- prompt_builder.py: update WhatsApp and Telegram platform hints so
  the agent knows it can use MEDIA:/path tags to send native media
This commit is contained in:
Daniel Sateler 2026-03-02 16:34:49 -03:00
parent 3c13feed4c
commit 3588396263
4 changed files with 272 additions and 15 deletions

View file

@ -90,11 +90,23 @@ SKILLS_GUIDANCE = (
PLATFORM_HINTS = { PLATFORM_HINTS = {
"whatsapp": ( "whatsapp": (
"You are on a text messaging communication platform, WhatsApp. " "You are on a text messaging communication platform, WhatsApp. "
"Please do not use markdown as it does not render." "Please do not use markdown as it does not render. "
"You can send media files natively: to deliver a file to the user, "
"include MEDIA:/absolute/path/to/file in your response. The file "
"will be sent as a native WhatsApp attachment — images (.jpg, .png, "
".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
"files arrive as downloadable documents. You can also include image "
"URLs in markdown format ![alt](url) and they will be sent as photos."
), ),
"telegram": ( "telegram": (
"You are on a text messaging communication platform, Telegram. " "You are on a text messaging communication platform, Telegram. "
"Please do not use markdown as it does not render." "Please do not use markdown as it does not render. "
"You can send media files natively: to deliver a file to the user, "
"include MEDIA:/absolute/path/to/file in your response. Images "
"(.jpg, .png) appear as photos, videos (.mp4) play inline, audio "
"(.ogg) sends as voice bubbles, and other files as documents. You "
"can also include image URLs in markdown format ![alt](url) and they "
"will be sent as native photos."
), ),
"discord": ( "discord": (
"You are in a Discord server or group chat communicating with your user." "You are in a Discord server or group chat communicating with your user."

View file

@ -509,7 +509,63 @@ class BasePlatformAdapter(ABC):
if caption: if caption:
text = f"{caption}\n{text}" text = f"{caption}\n{text}"
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to) return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
async def send_video(
    self,
    chat_id: str,
    video_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send a video natively via the platform API.

    Subclasses that support inline playable video should override this.
    The base implementation degrades gracefully: it sends the file path
    as a plain-text message, preceded by the caption when one is given.
    """
    parts = [f"🎬 Video: {video_path}"]
    if caption:
        parts.insert(0, caption)
    return await self.send(
        chat_id=chat_id,
        content="\n".join(parts),
        reply_to=reply_to,
    )
async def send_document(
    self,
    chat_id: str,
    file_path: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send a document/file natively via the platform API.

    Subclasses that support downloadable attachments should override
    this; ``file_name`` is only meaningful to such overrides. The base
    implementation degrades gracefully by sending the file path as a
    plain-text message, preceded by the caption when one is given.
    """
    fallback = f"📎 File: {file_path}"
    content = f"{caption}\n{fallback}" if caption else fallback
    return await self.send(chat_id=chat_id, content=content, reply_to=reply_to)
async def send_image_file(
    self,
    chat_id: str,
    image_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send a local image file natively via the platform API.

    Unlike send_image(), which takes a URL, this takes a local file
    path. Subclasses with native photo attachments should override it;
    the base implementation falls back to sending the path as text,
    preceded by the caption when one is given.
    """
    lines = ([caption] if caption else []) + [f"🖼️ Image: {image_path}"]
    return await self.send(
        chat_id=chat_id,
        content="\n".join(lines),
        reply_to=reply_to,
    )
@staticmethod @staticmethod
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]: def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
""" """
@ -676,19 +732,42 @@ class BasePlatformAdapter(ABC):
except Exception as img_err: except Exception as img_err:
print(f"[{self.name}] Error sending image: {img_err}") print(f"[{self.name}] Error sending image: {img_err}")
# Send extracted audio/voice files as native attachments # Send extracted media files — route by file type
for audio_path, is_voice in media_files: _AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'}
_VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'}
_IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
for media_path, is_voice in media_files:
if human_delay > 0: if human_delay > 0:
await asyncio.sleep(human_delay) await asyncio.sleep(human_delay)
try: try:
voice_result = await self.send_voice( from pathlib import Path as _Path
chat_id=event.source.chat_id, ext = _Path(media_path).suffix.lower()
audio_path=audio_path, if ext in _AUDIO_EXTS:
) media_result = await self.send_voice(
if not voice_result.success: chat_id=event.source.chat_id,
print(f"[{self.name}] Failed to send voice: {voice_result.error}") audio_path=media_path,
except Exception as voice_err: )
print(f"[{self.name}] Error sending voice: {voice_err}") elif ext in _VIDEO_EXTS:
media_result = await self.send_video(
chat_id=event.source.chat_id,
video_path=media_path,
)
elif ext in _IMAGE_EXTS:
media_result = await self.send_image_file(
chat_id=event.source.chat_id,
image_path=media_path,
)
else:
media_result = await self.send_document(
chat_id=event.source.chat_id,
file_path=media_path,
)
if not media_result.success:
print(f"[{self.name}] Failed to send media ({ext}): {media_result.error}")
except Exception as media_err:
print(f"[{self.name}] Error sending media: {media_err}")
# Check if there's a pending message that was queued during our processing # Check if there's a pending message that was queued during our processing
if session_key in self._pending_messages: if session_key in self._pending_messages:

View file

@ -281,7 +281,102 @@ class WhatsAppAdapter(BasePlatformAdapter):
) )
except Exception as e: except Exception as e:
return SendResult(success=False, error=str(e)) return SendResult(success=False, error=str(e))
async def _send_media_to_bridge(
    self,
    chat_id: str,
    file_path: str,
    media_type: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
) -> SendResult:
    """
    Send any media file through the bridge's POST /send-media endpoint.

    ``media_type`` is one of the bridge's types ("image", "video",
    "audio", "document"); ``caption``/``file_name`` are forwarded only
    when set. Returns a failed SendResult (never raises) on missing
    file, bridge error, or network failure.
    """
    if not self._running:
        return SendResult(success=False, error="Not connected")
    try:
        import aiohttp

        if not os.path.exists(file_path):
            return SendResult(success=False, error=f"File not found: {file_path}")
        body: Dict[str, Any] = {
            "chatId": chat_id,
            "filePath": file_path,
            "mediaType": media_type,
        }
        # Optional fields are only sent when truthy, matching the
        # bridge's `caption || undefined` handling.
        for key, value in (("caption", caption), ("fileName", file_name)):
            if value:
                body[key] = value
        url = f"http://localhost:{self._bridge_port}/send-media"
        # Generous timeout: large videos take a while to upload.
        timeout = aiohttp.ClientTimeout(total=120)
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=body, timeout=timeout) as resp:
                if resp.status != 200:
                    return SendResult(success=False, error=await resp.text())
                data = await resp.json()
                return SendResult(
                    success=True,
                    message_id=data.get("messageId"),
                    raw_response=data,
                )
    except Exception as e:
        return SendResult(success=False, error=str(e))
async def send_image(
    self,
    chat_id: str,
    image_url: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send an image URL as a native WhatsApp photo.

    Downloads the URL into the local image cache first, then ships the
    cached file through the bridge. Any failure (download or bridge)
    falls back to the base-class behavior, which sends the URL as text.
    """
    try:
        cached_path = await cache_image_from_url(image_url)
        return await self._send_media_to_bridge(
            chat_id, cached_path, "image", caption
        )
    except Exception:
        # Best-effort degradation: deliver the URL as plain text.
        return await super().send_image(chat_id, image_url, caption, reply_to)
async def send_image_file(
    self,
    chat_id: str,
    image_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """Send a local image file as a native WhatsApp photo via the bridge."""
    return await self._send_media_to_bridge(
        chat_id=chat_id,
        file_path=image_path,
        media_type="image",
        caption=caption,
    )
async def send_video(
    self,
    chat_id: str,
    video_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """Send a video via the bridge; it plays inline in WhatsApp."""
    return await self._send_media_to_bridge(
        chat_id=chat_id,
        file_path=video_path,
        media_type="video",
        caption=caption,
    )
async def send_document(
    self,
    chat_id: str,
    file_path: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send a file as a downloadable WhatsApp document via the bridge.

    When no explicit ``file_name`` is given, the file's basename is
    used as the display name of the attachment.
    """
    display_name = file_name if file_name else os.path.basename(file_path)
    return await self._send_media_to_bridge(
        chat_id=chat_id,
        file_path=file_path,
        media_type="document",
        caption=caption,
        file_name=display_name,
    )
async def send_typing(self, chat_id: str) -> None: async def send_typing(self, chat_id: str) -> None:
"""Send typing indicator via bridge.""" """Send typing indicator via bridge."""
if not self._running: if not self._running:

View file

@ -8,6 +8,7 @@
* Endpoints (matches gateway/platforms/whatsapp.py expectations): * Endpoints (matches gateway/platforms/whatsapp.py expectations):
* GET /messages - Long-poll for new incoming messages * GET /messages - Long-poll for new incoming messages
* POST /send - Send a message { chatId, message, replyTo? } * POST /send - Send a message { chatId, message, replyTo? }
* POST /send-media - Send media natively { chatId, filePath, mediaType?, caption?, fileName? }
* POST /typing - Send typing indicator { chatId } * POST /typing - Send typing indicator { chatId }
* GET /chat/:id - Get chat info * GET /chat/:id - Get chat info
* GET /health - Health check * GET /health - Health check
@ -21,7 +22,7 @@ import express from 'express';
import { Boom } from '@hapi/boom'; import { Boom } from '@hapi/boom';
import pino from 'pino'; import pino from 'pino';
import path from 'path'; import path from 'path';
import { mkdirSync } from 'fs'; import { mkdirSync, readFileSync, existsSync } from 'fs';
import qrcode from 'qrcode-terminal'; import qrcode from 'qrcode-terminal';
// Parse CLI args // Parse CLI args
@ -210,6 +211,76 @@ app.post('/send', async (req, res) => {
} }
}); });
// MIME type map and media type inference for /send-media.
// Keys are lowercase file extensions without the leading dot.
const MIME_MAP = {
  // Images
  jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
  webp: 'image/webp', gif: 'image/gif',
  // Videos
  mp4: 'video/mp4', mov: 'video/quicktime', avi: 'video/x-msvideo',
  mkv: 'video/x-matroska', '3gp': 'video/3gpp',
  // Audio — used when an audio file is sent with mediaType 'document';
  // the 'audio' branch of /send-media sets its own mimetype explicitly.
  ogg: 'audio/ogg', opus: 'audio/ogg', mp3: 'audio/mpeg',
  wav: 'audio/wav', m4a: 'audio/mp4',
  // Documents
  pdf: 'application/pdf',
  doc: 'application/msword',
  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
};
/**
 * Infer the WhatsApp media category from a lowercase file extension
 * (no leading dot). Anything unrecognized is sent as a document.
 * @param {string} ext - e.g. 'jpg', 'mp4', 'ogg'
 * @returns {'image'|'video'|'audio'|'document'}
 */
function inferMediaType(ext) {
  const IMAGE_EXTS = new Set(['jpg', 'jpeg', 'png', 'webp', 'gif']);
  const VIDEO_EXTS = new Set(['mp4', 'mov', 'avi', 'mkv', '3gp']);
  const AUDIO_EXTS = new Set(['ogg', 'opus', 'mp3', 'wav', 'm4a']);
  if (IMAGE_EXTS.has(ext)) return 'image';
  if (VIDEO_EXTS.has(ext)) return 'video';
  if (AUDIO_EXTS.has(ext)) return 'audio';
  return 'document';
}
// Send media (image, video, audio, document) natively.
// Body: { chatId, filePath, mediaType?, caption?, fileName? }.
// mediaType overrides inference; otherwise the file extension decides.
app.post('/send-media', async (req, res) => {
  if (!sock || connectionState !== 'connected') {
    return res.status(503).json({ error: 'Not connected to WhatsApp' });
  }
  const { chatId, filePath, mediaType, caption, fileName } = req.body;
  if (!chatId || !filePath) {
    return res.status(400).json({ error: 'chatId and filePath are required' });
  }
  try {
    if (!existsSync(filePath)) {
      return res.status(404).json({ error: `File not found: ${filePath}` });
    }
    const buffer = readFileSync(filePath);
    // path.extname handles extension-less files and dotted directory
    // names correctly; the previous split('.').pop() returned the whole
    // path remainder in those cases. slice(1) drops the leading dot
    // ('' stays '').
    const ext = path.extname(filePath).toLowerCase().slice(1);
    const type = mediaType || inferMediaType(ext);
    let msgPayload;
    switch (type) {
      case 'image':
        msgPayload = { image: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'image/jpeg' };
        break;
      case 'video':
        msgPayload = { video: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'video/mp4' };
        break;
      case 'audio': {
        // ogg/opus are sent as push-to-talk voice notes (ptt: true);
        // other audio arrives as a regular audio message.
        const isVoice = ext === 'ogg' || ext === 'opus';
        const audioMime = isVoice ? 'audio/ogg; codecs=opus' : 'audio/mpeg';
        msgPayload = { audio: buffer, mimetype: audioMime, ptt: isVoice };
        break;
      }
      case 'document':
      default:
        msgPayload = {
          document: buffer,
          fileName: fileName || path.basename(filePath),
          caption: caption || undefined,
          mimetype: MIME_MAP[ext] || 'application/octet-stream',
        };
        break;
    }
    const sent = await sock.sendMessage(chatId, msgPayload);
    res.json({ success: true, messageId: sent?.key?.id });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});
// Typing indicator // Typing indicator
app.post('/typing', async (req, res) => { app.post('/typing', async (req, res) => {
if (!sock || connectionState !== 'connected') { if (!sock || connectionState !== 'connected') {