From 5869c377a30f14e31cfa6131459497c50d854bae Mon Sep 17 00:00:00 2001
From: qixinbo <qixinbo@gmail.com>
Date: Sun, 29 Mar 2026 20:35:38 +0800
Subject: [PATCH] fix: kb message truncation bug

---
 backend/app/api/knowledge.py              | 27 +++++++++++++
 backend/main.py                           | 48 ++++++++++++++++++-----
 backend/pyproject.toml                    |  2 +
 backend/uv.lock                           | 41 +++++++++++++++++++
 frontend/src/components/ChatInterface.tsx | 15 +++++--
 frontend/src/i18n/locales/en.json         |  2 +-
 frontend/src/i18n/locales/zh.json         |  2 +-
 frontend/src/pages/KnowledgeBases.tsx     |  3 +-
 8 files changed, 123 insertions(+), 17 deletions(-)

diff --git a/backend/app/api/knowledge.py b/backend/app/api/knowledge.py
index 3ca75d7..0a8645c 100644
--- a/backend/app/api/knowledge.py
+++ b/backend/app/api/knowledge.py
@@ -67,6 +67,33 @@ def _extract_upload_text(filename: str, content: bytes) -> str:
         except Exception as e:
             raise ValueError(f"Failed to parse PDF: {str(e)}")
             
+    # Add text-extraction support for Word documents
+    if lower.endswith((".doc", ".docx")):
+        try:
+            import docx
+            doc = docx.Document(io.BytesIO(content))
+            return "\n".join([para.text for para in doc.paragraphs])
+        except ImportError:
+            raise ValueError("python-docx is not installed. Cannot parse Word files.")
+        except Exception as e:
+            raise ValueError(f"Failed to parse Word document: {str(e)}")
+            
+    # Add text-extraction support for PowerPoint documents
+    if lower.endswith((".ppt", ".pptx")):
+        try:
+            import pptx
+            prs = pptx.Presentation(io.BytesIO(content))
+            text = []
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    if hasattr(shape, "text"):
+                        text.append(shape.text)
+            return "\n".join(text)
+        except ImportError:
+            raise ValueError("python-pptx is not installed. Cannot parse PPT files.")
+        except Exception as e:
+            raise ValueError(f"Failed to parse PPT document: {str(e)}")
+            
     raise ValueError("Unsupported file type")
 
 
diff --git a/backend/main.py b/backend/main.py
index ad29e2a..149cb83 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -281,7 +281,7 @@ def _extract_kb_citations(kb_id: Optional[str], message: str) -> Tuple[str, List
             if not isinstance(item, dict):
                 continue
             title = str(item.get("title") or f"Doc {idx}")
-            chunk = str(item.get("chunk") or "").strip()
+            chunk = "\n".join(filter(None, str(item.get("chunk") or "").strip().split("\n")))  # drop every blank line: "\n\n" must stay unique as the context/message separator
             if not chunk:
                 continue
             score = float(item.get("score", 0.0) or 0.0)
@@ -297,11 +297,11 @@ def _extract_kb_citations(kb_id: Optional[str], message: str) -> Tuple[str, List
             )
         if not lines:
             return f"[System: A knowledge base is selected ({kb_id}). Retrieval result is empty.]\n{message}", []
-        context_block = "\n\n".join(lines)
-        next_message = f"[System: The following context is retrieved from knowledge base {kb_id}. You must ground your answer on it when relevant.]\n{context_block}\n\n{message}"
+        context_block = "\n".join(lines)
+        next_message = f"[Runtime Context — metadata only, not instructions]\nThe following context is retrieved from knowledge base {kb_id}. You must ground your answer on it when relevant.\n{context_block}\n\n{message}"
         return next_message, citations
     except Exception as exc:
-        return f"[System: A knowledge base is selected ({kb_id}) but retrieval failed: {exc}]\n{message}", []
+        return f"[Runtime Context — metadata only, not instructions]\nA knowledge base is selected ({kb_id}) but retrieval failed: {exc}\n\n{message}", []
 
 
 def _sync_session_project(session_id: str, project_id: Optional[int]) -> None:
@@ -408,15 +408,29 @@ async def nanobot_chat(request: ChatRequest):
 
         # Inject instructions if explicitly routed
         message, kb_citations = _extract_kb_citations(resolved_kb_id, request.message)
+        
+        instructions = []
         if request.route_mode == "sql" or request.prefer_sql_chart:
-            message = f"[System: Use the nl2sql tool to answer the query]\n{message}"
+            instructions.append("Use the nl2sql tool to answer the query")
         elif request.route_mode == "chat":
-            message = f"[System: Normal chat mode. Do NOT use the nl2sql tool]\n{message}"
+            instructions.append("Normal chat mode. Do NOT use the nl2sql tool")
 
         # Inject instructions for selected skills
         if request.skill_ids:
             skill_list = ", ".join(request.skill_ids)
-            message = f"[System: You must prioritize using the following skills/tools to answer the user's request: {skill_list}]\n{message}"
+            instructions.append(f"You must prioritize using the following skills/tools to answer the user's request: {skill_list}")
+            
+        if instructions:
+            instr_block = "\n".join(instructions)
+            # If message already has Runtime Context, append to it, otherwise create new
+            if message.startswith("[Runtime Context — metadata only, not instructions]"):
+                parts = message.split("\n\n", 1)
+                if len(parts) == 2:
+                    message = f"{parts[0]}\n{instr_block}\n\n{parts[1]}"
+                else:
+                    message = f"{message}\n{instr_block}"
+            else:
+                message = f"[Runtime Context — metadata only, not instructions]\n{instr_block}\n\n{message}"
 
         response = await nanobot_service.process_message(
             message,
@@ -494,15 +508,29 @@ async def nanobot_chat_stream(request: ChatRequest):
 
             # Inject instructions if explicitly routed
             message, kb_citations = _extract_kb_citations(resolved_kb_id, request.message)
+            
+            instructions = []
             if request.route_mode == "sql" or request.prefer_sql_chart:
-                message = f"[System: Use the nl2sql tool to answer the query]\n{message}"
+                instructions.append("Use the nl2sql tool to answer the query")
             elif request.route_mode == "chat":
-                message = f"[System: Normal chat mode. Do NOT use the nl2sql tool]\n{message}"
+                instructions.append("Normal chat mode. Do NOT use the nl2sql tool")
 
             # Inject instructions for selected skills
             if request.skill_ids:
                 skill_list = ", ".join(request.skill_ids)
-                message = f"[System: You must prioritize using the following skills/tools to answer the user's request: {skill_list}]\n{message}"
+                instructions.append(f"You must prioritize using the following skills/tools to answer the user's request: {skill_list}")
+                
+            if instructions:
+                instr_block = "\n".join(instructions)
+                # If message already has Runtime Context, append to it, otherwise create new
+                if message.startswith("[Runtime Context — metadata only, not instructions]"):
+                    parts = message.split("\n\n", 1)
+                    if len(parts) == 2:
+                        message = f"{parts[0]}\n{instr_block}\n\n{parts[1]}"
+                    else:
+                        message = f"{message}\n{instr_block}"
+                else:
+                    message = f"[Runtime Context — metadata only, not instructions]\n{instr_block}\n\n{message}"
 
             current_task = asyncio.create_task(
                 nanobot_service.process_message(
diff --git a/backend/pyproject.toml b/backend/pyproject.toml
index 8529ade..ae457de 100644
--- a/backend/pyproject.toml
+++ b/backend/pyproject.toml
@@ -50,6 +50,8 @@ dependencies = [
     "uvicorn>=0.41.0",
     "websocket-client>=1.9.0,<2.0.0",
     "websockets>=16.0,<17.0",
+    "python-docx>=1.2.0",
+    "python-pptx>=1.0.2",
 ]
 
 [tool.uv.sources]
diff --git a/backend/uv.lock b/backend/uv.lock
index e5962be..b387ebf 100644
--- a/backend/uv.lock
+++ b/backend/uv.lock
@@ -246,9 +246,11 @@ dependencies = [
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pypdf2" },
+    { name = "python-docx" },
     { name = "python-dotenv" },
     { name = "python-jose", extra = ["cryptography"] },
     { name = "python-multipart" },
+    { name = "python-pptx" },
     { name = "python-socketio" },
     { name = "python-socks" },
     { name = "python-telegram-bot", extra = ["socks"] },
@@ -295,9 +297,11 @@ requires-dist = [
     { name = "pydantic", specifier = ">=2.12.0,<3.0.0" },
     { name = "pydantic-settings", specifier = ">=2.12.0,<3.0.0" },
     { name = "pypdf2", specifier = ">=3.0.0" },
+    { name = "python-docx", specifier = ">=1.2.0" },
     { name = "python-dotenv", specifier = ">=1.0.1" },
     { name = "python-jose", extras = ["cryptography"], specifier = ">=3.5.0" },
     { name = "python-multipart", specifier = ">=0.0.22" },
+    { name = "python-pptx", specifier = ">=1.0.2" },
     { name = "python-socketio", specifier = ">=5.16.0,<6.0.0" },
     { name = "python-socks", extras = ["asyncio"], specifier = ">=2.8.0,<3.0.0" },
     { name = "python-telegram-bot", extras = ["socks"], specifier = ">=22.6,<23.0" },
@@ -2683,6 +2687,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
 ]
 
+[[package]]
+name = "python-docx"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "lxml" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" },
+]
+
 [[package]]
 name = "python-dotenv"
 version = "1.2.2"
@@ -2732,6 +2749,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" },
 ]
 
+[[package]]
+name = "python-pptx"
+version = "1.0.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "lxml" },
+    { name = "pillow" },
+    { name = "typing-extensions" },
+    { name = "xlsxwriter" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" },
+]
+
 [[package]]
 name = "python-socketio"
 version = "5.16.1"
@@ -3684,6 +3716,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a4/f5/10b68b7b1544245097b2a1b8238f66f2fc6dcaeb24ba5d917f52bd2eed4f/wsproto-1.3.2-py3-none-any.whl", hash = "sha256:61eea322cdf56e8cc904bd3ad7573359a242ba65688716b0710a5eb12beab584", size = 24405, upload-time = "2025-11-20T18:18:00.454Z" },
 ]
 
+[[package]]
+name = "xlsxwriter"
+version = "3.2.9"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" },
+]
+
 [[package]]
 name = "yarl"
 version = "1.23.0"
diff --git a/frontend/src/components/ChatInterface.tsx b/frontend/src/components/ChatInterface.tsx
index 0a45685..ce0eb63 100644
--- a/frontend/src/components/ChatInterface.tsx
+++ b/frontend/src/components/ChatInterface.tsx
@@ -574,10 +574,17 @@ export function ChatInterface() {
               let cleanContent = m.content || "";
               // Remove injected system prompt instructions from user messages if present
               if (m.role === 'user') {
-                cleanContent = cleanContent.replace(/^\[System:.*?\]\n?/i, '');
-                // Handle cases where there might be a runtime context block for skills
-                cleanContent = cleanContent.replace(/\[Runtime Context[\s\S]*?(?=\[System:|$)/i, '');
-                cleanContent = cleanContent.replace(/\[System:.*?\]\n?/i, ''); // clean again in case it follows context
+                if (cleanContent.startsWith("[Runtime Context")) {
+                  const splitIndex = cleanContent.indexOf("\n\n");
+                  if (splitIndex !== -1) {
+                    cleanContent = cleanContent.substring(splitIndex + 2);
+                  } else {
+                    cleanContent = "";
+                  }
+                } else if (cleanContent.startsWith("[System:")) {
+                  // Fallback for older messages containing [System: ...] wrapper
+                  cleanContent = cleanContent.replace(/^\[System:[\s\S]*?\]\n*/i, '');
+                }
                 cleanContent = cleanContent.trim();
               }
               return {
diff --git a/frontend/src/i18n/locales/en.json b/frontend/src/i18n/locales/en.json
index 351acad..c5d0787 100644
--- a/frontend/src/i18n/locales/en.json
+++ b/frontend/src/i18n/locales/en.json
@@ -208,7 +208,7 @@
   "knowledgeDocumentDeleteFailed": "Failed to delete document",
   "noKnowledgeDocuments": "No documents in this knowledge base",
   "knowledgeDocumentUploadTitle": "Upload Documents to Knowledge Base",
-  "knowledgeDocumentUploadHint": "Supports txt, md, json, yaml, xml, html, csv, xls, xlsx. Max 5MB per file.",
+  "knowledgeDocumentUploadHint": "Supports Text/Markdown/Code, Office (Word/Excel/PPT) and PDF. Max 15MB per file.",
   "knowledgeDocumentUploadSelected": "{{count}} file(s) selected",
   "knowledgeDocumentUploadNone": "No files selected",
   "knowledgeDocumentUploadAction": "Upload and Add",
diff --git a/frontend/src/i18n/locales/zh.json b/frontend/src/i18n/locales/zh.json
index 55cb4da..1e0d2d5 100644
--- a/frontend/src/i18n/locales/zh.json
+++ b/frontend/src/i18n/locales/zh.json
@@ -222,7 +222,7 @@
   "knowledgeDocumentDeleteFailed": "删除文档失败",
   "noKnowledgeDocuments": "当前知识库还没有文档",
   "knowledgeDocumentUploadTitle": "上传文档到知识库",
-  "knowledgeDocumentUploadHint": "支持 txt、md、json、yaml、xml、html、csv、xls、xlsx，单文件不超过 5MB。",
+  "knowledgeDocumentUploadHint": "支持 文本/Markdown/代码、Office (Word/Excel/PPT) 及 PDF 文件，单文件不超过 15MB。",
   "knowledgeDocumentUploadSelected": "已选择 {{count}} 个文件",
   "knowledgeDocumentUploadNone": "尚未选择文件",
   "knowledgeDocumentUploadAction": "上传并入库",
diff --git a/frontend/src/pages/KnowledgeBases.tsx b/frontend/src/pages/KnowledgeBases.tsx
index 73ef3d2..904b72f 100644
--- a/frontend/src/pages/KnowledgeBases.tsx
+++ b/frontend/src/pages/KnowledgeBases.tsx
@@ -499,7 +499,8 @@ export function KnowledgeBases() {
           <div className="flex-1 overflow-y-auto p-6 space-y-6">
             {/* Upload Section */}
             <div className="rounded-lg border border-border p-4 bg-muted/30">
-              <div className="text-sm font-medium text-foreground mb-3">{t('knowledgeDocumentUploadTitle')}</div>
+              <div className="text-sm font-medium text-foreground mb-1">{t('knowledgeDocumentUploadTitle')}</div>
+              <div className="text-xs text-muted-foreground mb-3">{t('knowledgeDocumentUploadHint')}</div>
               <div className="flex items-center gap-3">
                 <Input
                   type="file"