<!-- source metadata: 190 lines · 7.4 KiB · HTML -->
<!DOCTYPE html>
|
|
<html lang="zh-CN">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title>VLM</title>
|
|
<style>
|
|
body { font-family: sans-serif; display: flex; flex-direction: column; align-items: center; background-color: #f0f2f5; margin-top: 50px; }
|
|
.container { display: flex; gap: 20px; }
|
|
.video-box { background: #000; border-radius: 10px; overflow: hidden; width: 320px; height: 240px; position: relative; }
|
|
video { width: 100%; height: 100%; object-fit: cover; }
|
|
.avatar-box { width: 320px; height: 240px; background: #fff; border-radius: 10px; display: flex; align-items: center; justify-content: center; box-shadow: 0 4px 12px rgba(0,0,0,0.1); flex-direction: column; }
|
|
.avatar-box img { width: 100px; height: 100px; border-radius: 50%; margin-bottom: 10px; }
|
|
.controls { margin-top: 30px; }
|
|
button { padding: 15px 30px; font-size: 18px; border: none; border-radius: 25px; cursor: pointer; background-color: #007bff; color: white; transition: background 0.2s; }
|
|
button:active { background-color: #0056b3; }
|
|
button:disabled { background-color: #ccc; cursor: not-allowed; }
|
|
#status { margin-top: 15px; color: #555; font-weight: bold; }
|
|
#transcript { margin-top: 20px; width: 600px; text-align: center; color: #333; }
|
|
</style>
|
|
</head>
|
|
<body>
|
|
|
|
<h2>🤖 VLM</h2>
|
|
|
|
<div class="container">
|
|
<div class="video-box">
|
|
<video id="userVideo" autoplay muted playsinline></video>
|
|
</div>
|
|
|
|
<div class="avatar-box" id="avatarBox">
|
|
<img src="https://api.dicebear.com/7.x/bottts/svg?seed=Felix" alt="Avatar" id="avatarImg">
|
|
<span id="avatarStatus">等待互动...</span>
|
|
</div>
|
|
</div>
|
|
|
|
<canvas id="canvas" style="display:none;"></canvas>
|
|
|
|
<div class="controls">
|
|
<button id="talkBtn" disabled>连接中...</button>
|
|
</div>
|
|
|
|
<div id="status">正在准备摄像头和麦克风...</div>
|
|
<div id="transcript"></div>
|
|
|
|
<script>
|
|
// --- DOM element handles ---
const videoElement = document.getElementById('userVideo');    // live camera preview
const canvasElement = document.getElementById('canvas');      // hidden canvas used to snapshot video frames
const talkBtn = document.getElementById('talkBtn');           // push-to-talk button
const statusText = document.getElementById('status');         // connection / activity status line
const transcriptText = document.getElementById('transcript'); // streamed AI reply text
const avatarStatus = document.getElementById('avatarStatus'); // avatar state label ("thinking", "speaking", ...)

// --- Mutable application state ---
let ws;                  // WebSocket to the backend (assigned in initWebSocket)
let mediaRecorder;       // MediaRecorder over the mic stream (assigned in initMedia)
let audioChunks = [];    // audio buffers accumulated during one recording
let isRecording = false; // true while the talk button is held down
|
|
|
|
// 1. Initialise camera and microphone, set up the audio recorder, then
//    connect to the backend. On failure (permission denied / no device)
//    the UI shows an error and the app stops here.
async function initMedia() {
    try {
        const stream = await navigator.mediaDevices.getUserMedia({ video: true, audio: true });
        videoElement.srcObject = stream;

        // Record microphone audio while the talk button is held.
        mediaRecorder = new MediaRecorder(stream);

        mediaRecorder.ondataavailable = (event) => {
            if (event.data.size > 0) audioChunks.push(event.data);
        };

        mediaRecorder.onstop = async () => {
            const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
            audioChunks = []; // reset for the next recording

            // Nothing captured (e.g. an extremely short press) — skip the round trip.
            if (audioBlob.size === 0) {
                statusText.innerText = "未录到声音,请再试一次。";
                return;
            }

            try {
                // Snapshot the camera frame at the moment recording ends.
                const imageBase64 = captureFrame();

                // Encode the recorded audio as a Base64 data URL.
                const audioBase64 = await blobToBase64(audioBlob);

                // Ship both to the backend over the WebSocket.
                sendToServer(audioBase64, imageBase64);
            } catch (err) {
                // Without this, a throw inside this async handler would be an
                // unhandled rejection and the UI would give no feedback.
                statusText.innerText = "处理录音/画面失败,请重试。";
                console.error(err);
            }
        };

        statusText.innerText = "设备已就绪,正在连接服务器...";
        initWebSocket();

    } catch (err) {
        statusText.innerText = "获取摄像头/麦克风失败,请允许权限!";
        console.error(err);
    }
}
|
|
|
|
// 2. Open the WebSocket to the FastAPI backend and wire up its handlers.
//    The backend listens on port 8000; the host is taken from the page's own
//    location so the demo also works when served from another machine
//    (falling back to localhost for file:// pages, where hostname is empty).
function initWebSocket() {
    const host = location.hostname || 'localhost';
    ws = new WebSocket(`ws://${host}:8000/ws`);

    ws.onopen = () => {
        statusText.innerText = "✅ 已连接到大脑!长按按钮说话。";
        talkBtn.innerText = "按住说话 🎙️";
        talkBtn.disabled = false;
    };

    ws.onmessage = (event) => {
        // Guard the parse: one malformed frame must not kill the handler.
        let response;
        try {
            response = JSON.parse(event.data);
        } catch (err) {
            console.error('Unparseable message from server:', err);
            return;
        }

        if (response.type === 'text') {
            // Streamed text chunk from the model. Append as plain text so
            // server output is never interpreted as HTML (XSS-safe) and the
            // whole transcript is not re-parsed on every chunk.
            transcriptText.insertAdjacentText('beforeend', response.content);
        } else if (response.type === 'audio') {
            // Base64 TTS audio from the backend.
            playAudio(response.content);
        }
    };

    ws.onerror = (event) => {
        // Fires before onclose on connection failures; log for debugging.
        console.error('WebSocket error:', event);
    };

    ws.onclose = () => {
        statusText.innerText = "❌ 与服务器断开连接";
        talkBtn.disabled = true;
    };
}
|
|
|
|
// 3. Capture the current video frame and return it as a Base64 JPEG data URL.
//    The frame is downscaled (default 320x240) to keep the backend payload small.
//    @param {number} [width=320]  - output width in pixels
//    @param {number} [height=240] - output height in pixels
//    @returns {string} data URL of the form "data:image/jpeg;base64,..."
function captureFrame(width = 320, height = 240) {
    canvasElement.width = width;   // resizing also clears the canvas
    canvasElement.height = height;
    const context = canvasElement.getContext('2d');
    context.drawImage(videoElement, 0, 0, width, height);
    return canvasElement.toDataURL('image/jpeg', 0.8); // 0.8-quality JPEG
}
|
|
|
|
// 4. Helper: convert a Blob to a Base64 data URL.
//    @param {Blob} blob - binary data to encode
//    @returns {Promise<string>} resolves with "data:<mime>;base64,...";
//    rejects if the read fails (the original ignored the rejection path,
//    so a failed read would hang the send pipeline forever).
function blobToBase64(blob) {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();
        reader.onloadend = () => resolve(reader.result);
        reader.onerror = () => reject(reader.error);
        reader.readAsDataURL(blob);
    });
}
|
|
|
|
// 5. Send one user turn (recorded audio + captured frame) to the backend.
//    @param {string} audioB64 - audio as a Base64 data URL
//    @param {string} imageB64 - camera frame as a Base64 JPEG data URL
function sendToServer(audioB64, imageB64) {
    // The socket may have dropped between record-start and record-stop;
    // ws.send on a non-open socket would throw.
    if (!ws || ws.readyState !== WebSocket.OPEN) {
        statusText.innerText = "❌ 与服务器断开连接";
        talkBtn.disabled = true;
        return;
    }

    statusText.innerText = "🧠 正在思考...";
    avatarStatus.innerText = "思考中 🤔";
    transcriptText.innerHTML = "<strong>AI: </strong>"; // reset transcript for the new reply

    const payload = {
        type: "user_input",
        audio: audioB64,
        image: imageB64,
    };
    ws.send(JSON.stringify(payload));
}
|
|
|
|
// 6. Play Base64 (data-URL) TTS audio returned by the backend.
//    @param {string} base64Audio - audio data URL ("data:audio/...;base64,...")
function playAudio(base64Audio) {
    avatarStatus.innerText = "说话中 🗣️";

    const audio = new Audio(base64Audio);

    // Shared reset so the avatar never stays stuck on "speaking".
    const resetAvatar = () => {
        avatarStatus.innerText = "等待互动...";
    };

    audio.onended = () => {
        resetAvatar();
        statusText.innerText = "✅ 播放完毕,可继续对话。";
    };

    audio.onerror = () => {
        resetAvatar();
        statusText.innerText = "❌ 音频播放失败";
    };

    // play() returns a Promise that rejects under autoplay restrictions;
    // unhandled, that rejection would leave the UI stuck on "说话中".
    audio.play().catch((err) => {
        console.error(err);
        resetAvatar();
        statusText.innerText = "❌ 音频播放失败";
    });
}
|
|
|
|
// 7. Push-to-talk wiring (walkie-talkie style: hold to record, release to send).
//    Fixes over the original: dragging the pointer off the button while held
//    no longer leaves the recorder running (mouseleave), and touch events are
//    handled so the demo also works on mobile.

// Begin recording if the recorder is ready and not already running.
function startRecording() {
    if (isRecording || !mediaRecorder) return;
    mediaRecorder.start();
    isRecording = true;
    talkBtn.innerText = "松开发送 ⬆️";
    statusText.innerText = "🎙️ 正在录音...并准备抓取画面...";
}

// Stop recording; mediaRecorder's onstop handler then sends the turn.
function stopRecording() {
    if (!isRecording) return;
    mediaRecorder.stop();
    isRecording = false;
    talkBtn.innerText = "按住说话 🎙️";
}

talkBtn.addEventListener('mousedown', startRecording);
talkBtn.addEventListener('mouseup', stopRecording);
// Pointer dragged off the button mid-press: treat as a release.
talkBtn.addEventListener('mouseleave', stopRecording);

// Mobile long-press. preventDefault suppresses the synthetic mouse events
// that would otherwise double-trigger the handlers above.
talkBtn.addEventListener('touchstart', (e) => { e.preventDefault(); startRecording(); });
talkBtn.addEventListener('touchend', (e) => { e.preventDefault(); stopRecording(); });
|
|
|
|
// Boot the app: acquire camera/mic first; initMedia then opens the WebSocket.
initMedia();
|
|
</script>
|
|
</body>
|
|
</html> |