feat(llm): implement structured JSON output for assistant responses

2025-10-26 17:01:59 +08:00
parent 33063821df
commit 2297ba097b
3 changed files with 242 additions and 55 deletions
--- a/api/chat_service.py
+++ b/api/chat_service.py
@ -1,4 +1,5 @@
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, Tuple
+import json
 import base64
 import threading
 from datetime import datetime
@ -31,6 +32,37 @@ class ChatService:
        logger.info(f"Initializing Mem0 integration for user: {self.user_id}")
        self._initialized = True
    
+    def _parse_assistant_output(self, raw_text: Any) -> Tuple[str, str, Optional[str]]:
+        """Parse model JSON output into reply/action while reporting parse issues."""
+        if raw_text is None:
+            return "", "无", "empty_response"
+
+        if not isinstance(raw_text, str):
+            raw_text = str(raw_text)
+
+        stripped = raw_text.strip()
+        if not stripped:
+            return "", "无", "empty_response"
+
+        try:
+            payload = json.loads(stripped)
+        except json.JSONDecodeError as exc:
+            logger.warning("Failed to parse assistant JSON response: %s", exc)
+            return stripped, "无", f"json_decode_error: {exc}"
+        except Exception as exc:
+            logger.warning("Unexpected error parsing assistant response: %s", exc)
+            return stripped, "无", f"parse_error: {exc}"
+
+        reply = str(payload.get("reply", "")).strip()
+        action = str(payload.get("action", "")).strip()
+
+        if not reply:
+            reply = stripped
+        if not action:
+            action = "无"
+
+        return reply, action, None
+
    def chat(self, user_input: str, include_audio: bool = True) -> Dict[str, Any]:
        """处理用户输入并返回回复（包含音频）"""
        if not self._initialized:
@ -51,13 +83,20 @@ class ChatService:
                }
            
            assistant_response = result["response"]
+            reply, action, parse_error = self._parse_assistant_output(assistant_response)
+            if parse_error:
+                logger.warning("Assistant output parse_error=%s", parse_error)
+
+            response_text = reply.strip() if isinstance(reply, str) else str(reply)
+            if not response_text:
+                response_text = assistant_response.strip() if isinstance(assistant_response, str) else str(assistant_response)
            
            # Step 2: Generate audio if requested
            audio_data = None
            audio_error = None
-            if include_audio:
+            if include_audio and response_text:
                try:
-                    success, message, base64_audio = text_to_speech(assistant_response, self.user_id)
+                    success, message, base64_audio = text_to_speech(response_text, self.user_id)
                    if success and base64_audio:
                        audio_data = base64_audio
                    else:
@ -68,7 +107,9 @@ class ChatService:
            # Step 3: Prepare response
            response_data = {
                "success": True,
-                "response": assistant_response,
+                "response": response_text,
+                "action": action,
+                "parse_error": parse_error,
                "user_id": self.user_id
            }
            
--- a/api/main.py
+++ b/api/main.py
@ -32,6 +32,8 @@ class ChatRequest(BaseModel):
 class ChatResponse(BaseModel):
    success: bool
    response: Optional[str] = None
+    action: Optional[str] = None
+    parse_error: Optional[str] = None
    tokens: Optional[int] = None
    user_id: str
    error: Optional[str] = None