From 5869c377a30f14e31cfa6131459497c50d854bae Mon Sep 17 00:00:00 2001 From: qixinbo Date: Sun, 29 Mar 2026 20:35:38 +0800 Subject: [PATCH] fix: kb message truncate bug fix --- backend/app/api/knowledge.py | 27 +++++++++++++ backend/main.py | 48 ++++++++++++++++++----- backend/pyproject.toml | 2 + backend/uv.lock | 41 +++++++++++++++++++ frontend/src/components/ChatInterface.tsx | 15 +++++-- frontend/src/i18n/locales/en.json | 2 +- frontend/src/i18n/locales/zh.json | 2 +- frontend/src/pages/KnowledgeBases.tsx | 3 +- 8 files changed, 123 insertions(+), 17 deletions(-) diff --git a/backend/app/api/knowledge.py b/backend/app/api/knowledge.py index 3ca75d7..0a8645c 100644 --- a/backend/app/api/knowledge.py +++ b/backend/app/api/knowledge.py @@ -67,6 +67,33 @@ def _extract_upload_text(filename: str, content: bytes) -> str: except Exception as e: raise ValueError(f"Failed to parse PDF: {str(e)}") + # 增加对 Word 文档的文本提取支持 + if lower.endswith((".doc", ".docx")): + try: + import docx + doc = docx.Document(io.BytesIO(content)) + return "\n".join([para.text for para in doc.paragraphs]) + except ImportError: + raise ValueError("python-docx is not installed. Cannot parse Word files.") + except Exception as e: + raise ValueError(f"Failed to parse Word document: {str(e)}") + + # 增加对 PPT 文档的文本提取支持 + if lower.endswith((".ppt", ".pptx")): + try: + import pptx + prs = pptx.Presentation(io.BytesIO(content)) + text = [] + for slide in prs.slides: + for shape in slide.shapes: + if hasattr(shape, "text"): + text.append(shape.text) + return "\n".join(text) + except ImportError: + raise ValueError("python-pptx is not installed. Cannot parse PPT files.") + except Exception as e: + raise ValueError(f"Failed to parse PPT document: {str(e)}") + raise ValueError("Unsupported file type") diff --git a/backend/main.py b/backend/main.py index ad29e2a..149cb83 100644 --- a/backend/main.py +++ b/backend/main.py @@ -281,7 +281,7 @@ def _extract_kb_citations(kb_id: Optional[str], message: str) -> Tuple[str, List if not isinstance(item, dict): continue title = str(item.get("title") or f"Doc {idx}") - chunk = str(item.get("chunk") or "").strip() + chunk = str(item.get("chunk") or "").strip().replace("\n\n", "\n") if not chunk: continue score = float(item.get("score", 0.0) or 0.0) @@ -297,11 +297,11 @@ def _extract_kb_citations(kb_id: Optional[str], message: str) -> Tuple[str, List ) if not lines: return f"[System: A knowledge base is selected ({kb_id}). Retrieval result is empty.]\n{message}", [] - context_block = "\n\n".join(lines) - next_message = f"[System: The following context is retrieved from knowledge base {kb_id}. You must ground your answer on it when relevant.]\n{context_block}\n\n{message}" + context_block = "\n".join(lines) + next_message = f"[Runtime Context — metadata only, not instructions]\nThe following context is retrieved from knowledge base {kb_id}. You must ground your answer on it when relevant.\n{context_block}\n\n{message}" return next_message, citations except Exception as exc: - return f"[System: A knowledge base is selected ({kb_id}) but retrieval failed: {exc}]\n{message}", [] + return f"[Runtime Context — metadata only, not instructions]\nA knowledge base is selected ({kb_id}) but retrieval failed: {exc}\n\n{message}", [] def _sync_session_project(session_id: str, project_id: Optional[int]) -> None: @@ -408,15 +408,29 @@ async def nanobot_chat(request: ChatRequest): # Inject instructions if explicitly routed message, kb_citations = _extract_kb_citations(resolved_kb_id, request.message) + + instructions = [] if request.route_mode == "sql" or request.prefer_sql_chart: - message = f"[System: Use the nl2sql tool to answer the query]\n{message}" + instructions.append("Use the nl2sql tool to answer the query") elif request.route_mode == "chat": - message = f"[System: Normal chat mode. Do NOT use the nl2sql tool]\n{message}" + instructions.append("Normal chat mode. Do NOT use the nl2sql tool") # Inject instructions for selected skills if request.skill_ids: skill_list = ", ".join(request.skill_ids) - message = f"[System: You must prioritize using the following skills/tools to answer the user's request: {skill_list}]\n{message}" + instructions.append(f"You must prioritize using the following skills/tools to answer the user's request: {skill_list}") + + if instructions: + instr_block = "\n".join(instructions) + # If message already has Runtime Context, append to it, otherwise create new + if message.startswith("[Runtime Context — metadata only, not instructions]"): + parts = message.split("\n\n", 1) + if len(parts) == 2: + message = f"{parts[0]}\n{instr_block}\n\n{parts[1]}" + else: + message = f"{message}\n{instr_block}" + else: + message = f"[Runtime Context — metadata only, not instructions]\n{instr_block}\n\n{message}" response = await nanobot_service.process_message( message, @@ -494,15 +508,29 @@ async def nanobot_chat_stream(request: ChatRequest): # Inject instructions if explicitly routed message, kb_citations = _extract_kb_citations(resolved_kb_id, request.message) + + instructions = [] if request.route_mode == "sql" or request.prefer_sql_chart: - message = f"[System: Use the nl2sql tool to answer the query]\n{message}" + instructions.append("Use the nl2sql tool to answer the query") elif request.route_mode == "chat": - message = f"[System: Normal chat mode. Do NOT use the nl2sql tool]\n{message}" + instructions.append("Normal chat mode. Do NOT use the nl2sql tool") # Inject instructions for selected skills if request.skill_ids: skill_list = ", ".join(request.skill_ids) - message = f"[System: You must prioritize using the following skills/tools to answer the user's request: {skill_list}]\n{message}" + instructions.append(f"You must prioritize using the following skills/tools to answer the user's request: {skill_list}") + + if instructions: + instr_block = "\n".join(instructions) + # If message already has Runtime Context, append to it, otherwise create new + if message.startswith("[Runtime Context — metadata only, not instructions]"): + parts = message.split("\n\n", 1) + if len(parts) == 2: + message = f"{parts[0]}\n{instr_block}\n\n{parts[1]}" + else: + message = f"{message}\n{instr_block}" + else: + message = f"[Runtime Context — metadata only, not instructions]\n{instr_block}\n\n{message}" current_task = asyncio.create_task( nanobot_service.process_message( diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 8529ade..ae457de 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -50,6 +50,8 @@ dependencies = [ "uvicorn>=0.41.0", "websocket-client>=1.9.0,<2.0.0", "websockets>=16.0,<17.0", + "python-docx>=1.2.0", + "python-pptx>=1.0.2", ] [tool.uv.sources] diff --git a/backend/uv.lock b/backend/uv.lock index e5962be..b387ebf 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -246,9 +246,11 @@ dependencies = [ { name = "pydantic" }, { name = "pydantic-settings" }, { name = "pypdf2" }, + { name = "python-docx" }, { name = "python-dotenv" }, { name = "python-jose", extra = ["cryptography"] }, { name = "python-multipart" }, + { name = "python-pptx" }, { name = "python-socketio" }, { name = "python-socks" }, { name = "python-telegram-bot", extra = ["socks"] }, @@ -295,9 +297,11 @@ requires-dist = [ { name = "pydantic", specifier = ">=2.12.0,<3.0.0" }, { name = "pydantic-settings", specifier = ">=2.12.0,<3.0.0" }, { name = "pypdf2", specifier = ">=3.0.0" }, + { name = "python-docx", specifier = ">=1.2.0" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "python-jose", extras = ["cryptography"], specifier = ">=3.5.0" }, { name = "python-multipart", specifier = ">=0.0.22" }, + { name = "python-pptx", specifier = ">=1.0.2" }, { name = "python-socketio", specifier = ">=5.16.0,<6.0.0" }, { name = "python-socks", extras = ["asyncio"], specifier = ">=2.8.0,<3.0.0" }, { name = "python-telegram-bot", extras = ["socks"], specifier = ">=22.6,<23.0" }, @@ -2683,6 +2687,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "python-docx" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a9/f7/eddfe33871520adab45aaa1a71f0402a2252050c14c7e3009446c8f4701c/python_docx-1.2.0.tar.gz", hash = "sha256:7bc9d7b7d8a69c9c02ca09216118c86552704edc23bac179283f2e38f86220ce", size = 5723256, upload-time = "2025-06-16T20:46:27.921Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d0/00/1e03a4989fa5795da308cd774f05b704ace555a70f9bf9d3be057b680bcf/python_docx-1.2.0-py3-none-any.whl", hash = "sha256:3fd478f3250fbbbfd3b94fe1e985955737c145627498896a8a6bf81f4baf66c7", size = 252987, upload-time = "2025-06-16T20:46:22.506Z" }, +] + [[package]] name = "python-dotenv" version = "1.2.2" @@ -2732,6 +2749,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, ] +[[package]] +name = "python-pptx" +version = "1.0.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "lxml" }, + { name = "pillow" }, + { name = "typing-extensions" }, + { name = "xlsxwriter" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/52/a9/0c0db8d37b2b8a645666f7fd8accea4c6224e013c42b1d5c17c93590cd06/python_pptx-1.0.2.tar.gz", hash = "sha256:479a8af0eaf0f0d76b6f00b0887732874ad2e3188230315290cd1f9dd9cc7095", size = 10109297, upload-time = "2024-08-07T17:33:37.772Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/4f/00be2196329ebbff56ce564aa94efb0fbc828d00de250b1980de1a34ab49/python_pptx-1.0.2-py3-none-any.whl", hash = "sha256:160838e0b8565a8b1f67947675886e9fea18aa5e795db7ae531606d68e785cba", size = 472788, upload-time = "2024-08-07T17:33:28.192Z" }, +] + [[package]] name = "python-socketio" version = "5.16.1" @@ -3684,6 +3716,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/f5/10b68b7b1544245097b2a1b8238f66f2fc6dcaeb24ba5d917f52bd2eed4f/wsproto-1.3.2-py3-none-any.whl", hash = "sha256:61eea322cdf56e8cc904bd3ad7573359a242ba65688716b0710a5eb12beab584", size = 24405, upload-time = "2025-11-20T18:18:00.454Z" }, ] +[[package]] +name = "xlsxwriter" +version = "3.2.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/46/2c/c06ef49dc36e7954e55b802a8b231770d286a9758b3d936bd1e04ce5ba88/xlsxwriter-3.2.9.tar.gz", hash = "sha256:254b1c37a368c444eac6e2f867405cc9e461b0ed97a3233b2ac1e574efb4140c", size = 215940, upload-time = "2025-09-16T00:16:21.63Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3a/0c/3662f4a66880196a590b202f0db82d919dd2f89e99a27fadef91c4a33d41/xlsxwriter-3.2.9-py3-none-any.whl", hash = "sha256:9a5db42bc5dff014806c58a20b9eae7322a134abb6fce3c92c181bfb275ec5b3", size = 175315, upload-time = "2025-09-16T00:16:20.108Z" }, +] + [[package]] name = "yarl" version = "1.23.0" diff --git a/frontend/src/components/ChatInterface.tsx b/frontend/src/components/ChatInterface.tsx index 0a45685..ce0eb63 100644 --- a/frontend/src/components/ChatInterface.tsx +++ b/frontend/src/components/ChatInterface.tsx @@ -574,10 +574,17 @@ export function ChatInterface() { let cleanContent = m.content || ""; // Remove injected system prompt instructions from user messages if present if (m.role === 'user') { - cleanContent = cleanContent.replace(/^\[System:.*?\]\n?/i, ''); - // Handle cases where there might be a runtime context block for skills - cleanContent = cleanContent.replace(/\[Runtime Context[\s\S]*?(?=\[System:|$)/i, ''); - cleanContent = cleanContent.replace(/\[System:.*?\]\n?/i, ''); // clean again in case it follows context + if (cleanContent.startsWith("[Runtime Context")) { + const splitIndex = cleanContent.indexOf("\n\n"); + if (splitIndex !== -1) { + cleanContent = cleanContent.substring(splitIndex + 2); + } else { + cleanContent = ""; + } + } else if (cleanContent.startsWith("[System:")) { + // Fallback for older messages containing [System: ...] wrapper + cleanContent = cleanContent.replace(/^\[System:[\s\S]*?\]\n*/i, ''); + } cleanContent = cleanContent.trim(); } return { diff --git a/frontend/src/i18n/locales/en.json b/frontend/src/i18n/locales/en.json index 351acad..c5d0787 100644 --- a/frontend/src/i18n/locales/en.json +++ b/frontend/src/i18n/locales/en.json @@ -208,7 +208,7 @@ "knowledgeDocumentDeleteFailed": "Failed to delete document", "noKnowledgeDocuments": "No documents in this knowledge base", "knowledgeDocumentUploadTitle": "Upload Documents to Knowledge Base", - "knowledgeDocumentUploadHint": "Supports txt, md, json, yaml, xml, html, csv, xls, xlsx. Max 5MB per file.", + "knowledgeDocumentUploadHint": "Supports Text/Markdown/Code, Office(Word/Excel/PPT) and PDF. Max 15MB per file.", "knowledgeDocumentUploadSelected": "{{count}} file(s) selected", "knowledgeDocumentUploadNone": "No files selected", "knowledgeDocumentUploadAction": "Upload and Add", diff --git a/frontend/src/i18n/locales/zh.json b/frontend/src/i18n/locales/zh.json index 55cb4da..1e0d2d5 100644 --- a/frontend/src/i18n/locales/zh.json +++ b/frontend/src/i18n/locales/zh.json @@ -222,7 +222,7 @@ "knowledgeDocumentDeleteFailed": "删除文档失败", "noKnowledgeDocuments": "当前知识库还没有文档", "knowledgeDocumentUploadTitle": "上传文档到知识库", - "knowledgeDocumentUploadHint": "支持 txt、md、json、yaml、xml、html、csv、xls、xlsx,单文件不超过 5MB。", + "knowledgeDocumentUploadHint": "支持 文本/Markdown/代码、Office (Word/Excel/PPT) 及 PDF 文件,单文件不超过 15MB。", "knowledgeDocumentUploadSelected": "已选择 {{count}} 个文件", "knowledgeDocumentUploadNone": "尚未选择文件", "knowledgeDocumentUploadAction": "上传并入库", diff --git a/frontend/src/pages/KnowledgeBases.tsx b/frontend/src/pages/KnowledgeBases.tsx index 73ef3d2..904b72f 100644 --- a/frontend/src/pages/KnowledgeBases.tsx +++ b/frontend/src/pages/KnowledgeBases.tsx @@ -499,7 +499,8 @@ export function KnowledgeBases() {
{/* Upload Section */}
-
{t('knowledgeDocumentUploadTitle')}
+
{t('knowledgeDocumentUploadTitle')}
+
{t('knowledgeDocumentUploadHint')}