feat(whatsapp): native media sending — images, videos, documents
Add a /send-media endpoint to the WhatsApp bridge and corresponding adapter methods so the agent can send files as native WhatsApp attachments instead of plain-text URLs/paths. - bridge.js: new POST /send-media endpoint using Baileys' native image/video/document/audio message types with MIME detection - base.py: add send_video(), send_document(), send_image_file() with text fallbacks; route MEDIA: tags by file extension instead of always treating them as voice messages - whatsapp.py: implement all media methods via a shared _send_media_to_bridge() helper; override send_image() to download URLs to local cache and send as native photos - prompt_builder.py: update WhatsApp and Telegram platform hints so the agent knows it can use MEDIA:/path tags to send native media
This commit is contained in:
parent
3c13feed4c
commit
3588396263
4 changed files with 272 additions and 15 deletions
|
|
@ -90,11 +90,23 @@ SKILLS_GUIDANCE = (
|
||||||
PLATFORM_HINTS = {
|
PLATFORM_HINTS = {
|
||||||
"whatsapp": (
|
"whatsapp": (
|
||||||
"You are on a text messaging communication platform, WhatsApp. "
|
"You are on a text messaging communication platform, WhatsApp. "
|
||||||
"Please do not use markdown as it does not render."
|
"Please do not use markdown as it does not render. "
|
||||||
|
"You can send media files natively: to deliver a file to the user, "
|
||||||
|
"include MEDIA:/absolute/path/to/file in your response. The file "
|
||||||
|
"will be sent as a native WhatsApp attachment — images (.jpg, .png, "
|
||||||
|
".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
|
||||||
|
"files arrive as downloadable documents. You can also include image "
|
||||||
|
"URLs in markdown format ![alt](url) and they will be sent as photos."
|
||||||
),
|
),
|
||||||
"telegram": (
|
"telegram": (
|
||||||
"You are on a text messaging communication platform, Telegram. "
|
"You are on a text messaging communication platform, Telegram. "
|
||||||
"Please do not use markdown as it does not render."
|
"Please do not use markdown as it does not render. "
|
||||||
|
"You can send media files natively: to deliver a file to the user, "
|
||||||
|
"include MEDIA:/absolute/path/to/file in your response. Images "
|
||||||
|
"(.jpg, .png) appear as photos, videos (.mp4) play inline, audio "
|
||||||
|
"(.ogg) sends as voice bubbles, and other files as documents. You "
|
||||||
|
"can also include image URLs in markdown format ![alt](url) and they "
|
||||||
|
"will be sent as native photos."
|
||||||
),
|
),
|
||||||
"discord": (
|
"discord": (
|
||||||
"You are in a Discord server or group chat communicating with your user."
|
"You are in a Discord server or group chat communicating with your user."
|
||||||
|
|
|
||||||
|
|
@ -510,6 +510,62 @@ class BasePlatformAdapter(ABC):
|
||||||
text = f"{caption}\n{text}"
|
text = f"{caption}\n{text}"
|
||||||
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
|
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
|
||||||
|
|
||||||
|
async def send_video(
    self,
    chat_id: str,
    video_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Deliver a video to a chat.

    This base implementation has no native video support: it sends a
    plain-text message containing the file path through ``self.send``.
    Platform adapters are expected to override it with a real attachment.

    Args:
        chat_id: Target chat identifier.
        video_path: Path of the video file to reference in the message.
        caption: Optional text placed on its own line above the path.
        reply_to: Optional message id, forwarded unchanged to ``self.send``.

    Returns:
        The result of ``self.send`` for the fallback text message.
    """
    # Caption (when non-empty) goes first, the path line always comes last.
    message_lines = [caption] if caption else []
    message_lines.append(f"🎬 Video: {video_path}")
    return await self.send(
        chat_id=chat_id,
        content="\n".join(message_lines),
        reply_to=reply_to,
    )
|
||||||
|
|
||||||
|
async def send_document(
    self,
    chat_id: str,
    file_path: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Deliver a document/file to a chat.

    This base implementation has no native attachment support: it sends a
    plain-text message containing the file path through ``self.send``.
    Platform adapters are expected to override it with a real attachment.

    Args:
        chat_id: Target chat identifier.
        file_path: Path of the file to reference in the message.
        caption: Optional text placed on its own line above the path.
        file_name: Accepted for interface parity with overrides; the text
            fallback does not use it.
        reply_to: Optional message id, forwarded unchanged to ``self.send``.

    Returns:
        The result of ``self.send`` for the fallback text message.
    """
    # Caption (when non-empty) goes first, the path line always comes last.
    message_lines = [caption] if caption else []
    message_lines.append(f"📎 File: {file_path}")
    return await self.send(
        chat_id=chat_id,
        content="\n".join(message_lines),
        reply_to=reply_to,
    )
|
||||||
|
|
||||||
|
async def send_image_file(
    self,
    chat_id: str,
    image_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Deliver a local image file to a chat.

    Counterpart of ``send_image()`` for images that already live on disk
    (``send_image()`` takes a URL). This base implementation has no native
    photo support: it sends a plain-text message containing the file path
    through ``self.send``. Platform adapters are expected to override it.

    Args:
        chat_id: Target chat identifier.
        image_path: Path of the image file to reference in the message.
        caption: Optional text placed on its own line above the path.
        reply_to: Optional message id, forwarded unchanged to ``self.send``.

    Returns:
        The result of ``self.send`` for the fallback text message.
    """
    # Caption (when non-empty) goes first, the path line always comes last.
    message_lines = [caption] if caption else []
    message_lines.append(f"🖼️ Image: {image_path}")
    return await self.send(
        chat_id=chat_id,
        content="\n".join(message_lines),
        reply_to=reply_to,
    )
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
|
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
|
||||||
"""
|
"""
|
||||||
|
|
@ -676,19 +732,42 @@ class BasePlatformAdapter(ABC):
|
||||||
except Exception as img_err:
|
except Exception as img_err:
|
||||||
print(f"[{self.name}] Error sending image: {img_err}")
|
print(f"[{self.name}] Error sending image: {img_err}")
|
||||||
|
|
||||||
# Send extracted audio/voice files as native attachments
|
# Send extracted media files — route by file type
|
||||||
for audio_path, is_voice in media_files:
|
_AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'}
|
||||||
|
_VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'}
|
||||||
|
_IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
|
||||||
|
|
||||||
|
for media_path, is_voice in media_files:
|
||||||
if human_delay > 0:
|
if human_delay > 0:
|
||||||
await asyncio.sleep(human_delay)
|
await asyncio.sleep(human_delay)
|
||||||
try:
|
try:
|
||||||
voice_result = await self.send_voice(
|
from pathlib import Path as _Path
|
||||||
chat_id=event.source.chat_id,
|
ext = _Path(media_path).suffix.lower()
|
||||||
audio_path=audio_path,
|
if ext in _AUDIO_EXTS:
|
||||||
)
|
media_result = await self.send_voice(
|
||||||
if not voice_result.success:
|
chat_id=event.source.chat_id,
|
||||||
print(f"[{self.name}] Failed to send voice: {voice_result.error}")
|
audio_path=media_path,
|
||||||
except Exception as voice_err:
|
)
|
||||||
print(f"[{self.name}] Error sending voice: {voice_err}")
|
elif ext in _VIDEO_EXTS:
|
||||||
|
media_result = await self.send_video(
|
||||||
|
chat_id=event.source.chat_id,
|
||||||
|
video_path=media_path,
|
||||||
|
)
|
||||||
|
elif ext in _IMAGE_EXTS:
|
||||||
|
media_result = await self.send_image_file(
|
||||||
|
chat_id=event.source.chat_id,
|
||||||
|
image_path=media_path,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
media_result = await self.send_document(
|
||||||
|
chat_id=event.source.chat_id,
|
||||||
|
file_path=media_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not media_result.success:
|
||||||
|
print(f"[{self.name}] Failed to send media ({ext}): {media_result.error}")
|
||||||
|
except Exception as media_err:
|
||||||
|
print(f"[{self.name}] Error sending media: {media_err}")
|
||||||
|
|
||||||
# Check if there's a pending message that was queued during our processing
|
# Check if there's a pending message that was queued during our processing
|
||||||
if session_key in self._pending_messages:
|
if session_key in self._pending_messages:
|
||||||
|
|
|
||||||
|
|
@ -282,6 +282,101 @@ class WhatsAppAdapter(BasePlatformAdapter):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return SendResult(success=False, error=str(e))
|
return SendResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
async def _send_media_to_bridge(
    self,
    chat_id: str,
    file_path: str,
    media_type: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
) -> SendResult:
    """
    Send a local file through the bridge's POST /send-media endpoint.

    Args:
        chat_id: WhatsApp chat identifier, sent as ``chatId``.
        file_path: Path of the file to send. ``~`` is expanded and the path
            is made absolute before it is handed to the bridge.
        media_type: Value sent as ``mediaType`` (callers in this class use
            "image", "video" and "document").
        caption: Optional caption; only included in the payload when truthy.
        file_name: Optional display name; only included when truthy.

    Returns:
        ``SendResult(success=True, ...)`` carrying the bridge's ``messageId``
        on HTTP 200; otherwise ``SendResult(success=False, error=...)``.
        Exceptions are never propagated — they are reported via ``error``.
    """
    if not self._running:
        return SendResult(success=False, error="Not connected")
    try:
        import aiohttp

        # The bridge is reached over HTTP and reads the file itself, so hand
        # it an unambiguous absolute path: it may not share this process's
        # working directory and will not expand "~".
        resolved_path = os.path.abspath(os.path.expanduser(file_path))

        # isfile (not exists): a directory cannot be sent as media.
        if not os.path.isfile(resolved_path):
            return SendResult(success=False, error=f"File not found: {file_path}")

        payload: Dict[str, Any] = {
            "chatId": chat_id,
            "filePath": resolved_path,
            "mediaType": media_type,
        }
        if caption:
            payload["caption"] = caption
        if file_name:
            payload["fileName"] = file_name

        async with aiohttp.ClientSession() as session:
            async with session.post(
                f"http://localhost:{self._bridge_port}/send-media",
                json=payload,
                # Generous limit: the bridge reads and uploads the whole file.
                timeout=aiohttp.ClientTimeout(total=120),
            ) as resp:
                if resp.status == 200:
                    data = await resp.json()
                    return SendResult(
                        success=True,
                        message_id=data.get("messageId"),
                        raw_response=data,
                    )
                # Non-200: surface the bridge's response body as the error.
                error = await resp.text()
                return SendResult(success=False, error=error)

    except Exception as e:
        return SendResult(success=False, error=str(e))
|
||||||
|
|
||||||
|
async def send_image(
    self,
    chat_id: str,
    image_url: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send an image given by URL as a native photo via the bridge.

    The URL is first downloaded with ``cache_image_from_url`` and the
    resulting local file is sent through ``_send_media_to_bridge``. If the
    download raises, or the bridge reports a failed send, the failure is
    logged and the call falls back to the base class ``send_image``.

    Args:
        chat_id: WhatsApp chat identifier.
        image_url: URL of the image to download and send.
        caption: Optional caption for the photo.
        reply_to: Optional message id; only used by the fallback path
            (the bridge helper takes no reply target).

    Returns:
        The bridge send result on success, otherwise the fallback's result.
    """
    result = None
    try:
        local_path = await cache_image_from_url(image_url)
        result = await self._send_media_to_bridge(chat_id, local_path, "image", caption)
    except Exception as img_err:
        # Best-effort native send: report the problem, then fall back below.
        print(f"[{self.name}] Error sending native image: {img_err}")

    if result is not None:
        if result.success:
            return result
        # The bridge helper reports failures via the result rather than
        # raising, so a failed send must be checked for explicitly.
        print(f"[{self.name}] Native image send failed: {result.error}")

    return await super().send_image(chat_id, image_url, caption, reply_to)
|
||||||
|
|
||||||
|
async def send_image_file(
    self,
    chat_id: str,
    image_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send an image that already exists on disk as a native photo.

    Thin wrapper over ``_send_media_to_bridge`` with media type "image".
    ``reply_to`` is accepted for interface compatibility but is not passed
    on to the bridge helper.
    """
    bridge_result = await self._send_media_to_bridge(
        chat_id=chat_id,
        file_path=image_path,
        media_type="image",
        caption=caption,
    )
    return bridge_result
|
||||||
|
|
||||||
|
async def send_video(
    self,
    chat_id: str,
    video_path: str,
    caption: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send a local video file as a native video message.

    Thin wrapper over ``_send_media_to_bridge`` with media type "video".
    ``reply_to`` is accepted for interface compatibility but is not passed
    on to the bridge helper.
    """
    bridge_result = await self._send_media_to_bridge(
        chat_id=chat_id,
        file_path=video_path,
        media_type="video",
        caption=caption,
    )
    return bridge_result
|
||||||
|
|
||||||
|
async def send_document(
    self,
    chat_id: str,
    file_path: str,
    caption: Optional[str] = None,
    file_name: Optional[str] = None,
    reply_to: Optional[str] = None,
) -> SendResult:
    """
    Send a local file as a document attachment.

    Thin wrapper over ``_send_media_to_bridge`` with media type "document".
    When ``file_name`` is not given (or is empty), the base name of
    ``file_path`` is used as the attachment's name. ``reply_to`` is accepted
    for interface compatibility but is not passed on to the bridge helper.
    """
    display_name = file_name if file_name else os.path.basename(file_path)
    return await self._send_media_to_bridge(
        chat_id=chat_id,
        file_path=file_path,
        media_type="document",
        caption=caption,
        file_name=display_name,
    )
|
||||||
|
|
||||||
async def send_typing(self, chat_id: str) -> None:
|
async def send_typing(self, chat_id: str) -> None:
|
||||||
"""Send typing indicator via bridge."""
|
"""Send typing indicator via bridge."""
|
||||||
if not self._running:
|
if not self._running:
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,7 @@
|
||||||
* Endpoints (matches gateway/platforms/whatsapp.py expectations):
|
* Endpoints (matches gateway/platforms/whatsapp.py expectations):
|
||||||
* GET /messages - Long-poll for new incoming messages
|
* GET /messages - Long-poll for new incoming messages
|
||||||
* POST /send - Send a message { chatId, message, replyTo? }
|
* POST /send - Send a message { chatId, message, replyTo? }
|
||||||
|
* POST /send-media - Send media natively { chatId, filePath, mediaType?, caption?, fileName? }
|
||||||
* POST /typing - Send typing indicator { chatId }
|
* POST /typing - Send typing indicator { chatId }
|
||||||
* GET /chat/:id - Get chat info
|
* GET /chat/:id - Get chat info
|
||||||
* GET /health - Health check
|
* GET /health - Health check
|
||||||
|
|
@ -21,7 +22,7 @@ import express from 'express';
|
||||||
import { Boom } from '@hapi/boom';
|
import { Boom } from '@hapi/boom';
|
||||||
import pino from 'pino';
|
import pino from 'pino';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import { mkdirSync } from 'fs';
|
import { mkdirSync, readFileSync, existsSync } from 'fs';
|
||||||
import qrcode from 'qrcode-terminal';
|
import qrcode from 'qrcode-terminal';
|
||||||
|
|
||||||
// Parse CLI args
|
// Parse CLI args
|
||||||
|
|
@ -210,6 +211,76 @@ app.post('/send', async (req, res) => {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// MIME type map and media type inference for /send-media
|
||||||
|
/**
 * Lowercase file extension (no leading dot) -> MIME type.
 *
 * Built on a null prototype so that lookups with arbitrary, file-derived
 * keys (e.g. an extension of "constructor" or "toString") miss cleanly
 * instead of resolving to inherited Object.prototype members. Frozen because
 * it is a shared lookup table, not state.
 */
const MIME_MAP = Object.freeze(Object.assign(Object.create(null), {
  // Images
  jpg: 'image/jpeg',
  jpeg: 'image/jpeg',
  png: 'image/png',
  webp: 'image/webp',
  gif: 'image/gif',
  // Videos
  mp4: 'video/mp4',
  mov: 'video/quicktime',
  avi: 'video/x-msvideo',
  mkv: 'video/x-matroska',
  '3gp': 'video/3gpp',
  // Documents
  pdf: 'application/pdf',
  doc: 'application/msword',
  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
}));
|
||||||
|
|
||||||
|
/**
 * Map a file extension to the kind of WhatsApp message it should be sent as.
 *
 * The extension is normalised first, so "jpg", "JPG" and ".jpg" are all
 * treated the same; null/undefined/unknown extensions fall back to
 * 'document'.
 *
 * @param {string} ext - File extension, with or without a leading dot.
 * @returns {'image'|'video'|'audio'|'document'} Media type for the extension.
 */
function inferMediaType(ext) {
  const normalized = String(ext ?? '').trim().toLowerCase().replace(/^\./, '');
  if (['jpg', 'jpeg', 'png', 'webp', 'gif'].includes(normalized)) return 'image';
  if (['mp4', 'mov', 'avi', 'mkv', '3gp'].includes(normalized)) return 'video';
  if (['ogg', 'opus', 'mp3', 'wav', 'm4a'].includes(normalized)) return 'audio';
  return 'document';
}
|
||||||
|
|
||||||
|
// Send media (image, video, document) natively
|
||||||
|
/**
 * POST /send-media — send a local file as a native WhatsApp message.
 *
 * Body: { chatId, filePath, mediaType?, caption?, fileName? }
 *   - mediaType: 'image' | 'video' | 'audio' | 'document'; inferred from the
 *     file extension when omitted. Any other value is sent as a document.
 *   - fileName: display name for documents; defaults to the file's basename.
 *
 * Responses: 200 { success, messageId } · 400 bad request · 404 file missing
 * · 503 not connected · 500 { error } for anything thrown while sending.
 *
 * NOTE(review): filePath is caller-supplied and read straight from local
 * disk — confirm this endpoint is only reachable by trusted local callers.
 */
app.post('/send-media', async (req, res) => {
  if (!sock || connectionState !== 'connected') {
    return res.status(503).json({ error: 'Not connected to WhatsApp' });
  }

  const { chatId, filePath, mediaType, caption, fileName } = req.body ?? {};
  if (!chatId || !filePath) {
    return res.status(400).json({ error: 'chatId and filePath are required' });
  }
  // A non-string path is a client error, not a server failure.
  if (typeof filePath !== 'string') {
    return res.status(400).json({ error: 'filePath must be a string' });
  }

  try {
    if (!existsSync(filePath)) {
      return res.status(404).json({ error: `File not found: ${filePath}` });
    }

    const buffer = readFileSync(filePath);

    // path.extname yields '' for extension-less files and ignores dots in
    // directory names, unlike splitting the whole path on '.'.
    const ext = path.extname(filePath).slice(1).toLowerCase();

    // Own-property check so extensions like "constructor" cannot resolve to
    // inherited Object.prototype members.
    const knownMime = Object.prototype.hasOwnProperty.call(MIME_MAP, ext)
      ? MIME_MAP[ext]
      : undefined;

    const type = mediaType || inferMediaType(ext);
    const captionOrUndefined = caption || undefined;
    let msgPayload;

    switch (type) {
      case 'image':
        msgPayload = { image: buffer, caption: captionOrUndefined, mimetype: knownMime || 'image/jpeg' };
        break;
      case 'video':
        msgPayload = { video: buffer, caption: captionOrUndefined, mimetype: knownMime || 'video/mp4' };
        break;
      case 'audio': {
        // Opus-in-Ogg is sent as a push-to-talk voice note (ptt: true);
        // other audio is a regular audio message labelled with its own
        // MIME type rather than always 'audio/mpeg'.
        const isVoiceNote = ext === 'ogg' || ext === 'opus';
        const audioMimeByExt = { mp3: 'audio/mpeg', m4a: 'audio/mp4', wav: 'audio/wav' };
        const audioMime = isVoiceNote
          ? 'audio/ogg; codecs=opus'
          : (Object.prototype.hasOwnProperty.call(audioMimeByExt, ext) ? audioMimeByExt[ext] : 'audio/mpeg');
        msgPayload = { audio: buffer, mimetype: audioMime, ptt: isVoiceNote };
        break;
      }
      case 'document':
      default:
        msgPayload = {
          document: buffer,
          fileName: fileName || path.basename(filePath),
          caption: captionOrUndefined,
          mimetype: knownMime || 'application/octet-stream',
        };
        break;
    }

    const sent = await sock.sendMessage(chatId, msgPayload);
    res.json({ success: true, messageId: sent?.key?.id });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});
|
||||||
|
|
||||||
// Typing indicator
|
// Typing indicator
|
||||||
app.post('/typing', async (req, res) => {
|
app.post('/typing', async (req, res) => {
|
||||||
if (!sock || connectionState !== 'connected') {
|
if (!sock || connectionState !== 'connected') {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue