From 00502f0fe7c1b49f1ed1482d4c6e4368bda7b9ef Mon Sep 17 00:00:00 2001 From: qixinbo Date: Sun, 29 Mar 2026 17:24:26 +0800 Subject: [PATCH] fix: kb add pdf support --- backend/app/api/knowledge.py | 22 ++++++++++++++++++++-- backend/pyproject.toml | 1 + backend/uv.lock | 11 +++++++++++ 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/backend/app/api/knowledge.py b/backend/app/api/knowledge.py index 0e763b5..3ca75d7 100644 --- a/backend/app/api/knowledge.py +++ b/backend/app/api/knowledge.py @@ -50,6 +50,23 @@ def _extract_upload_text(filename: str, content: bytes) -> str: if lower.endswith((".xls", ".xlsx")): df = pd.read_excel(io.BytesIO(content)) return df.to_csv(index=False) + + # 增加对 PDF 的文本提取支持 + if lower.endswith(".pdf"): + try: + import PyPDF2 + pdf_reader = PyPDF2.PdfReader(io.BytesIO(content)) + text = [] + for page in pdf_reader.pages: + page_text = page.extract_text() + if page_text: + text.append(page_text) + return "\n".join(text) + except ImportError: + raise ValueError("PyPDF2 is not installed. Cannot parse PDF files.") + except Exception as e: + raise ValueError(f"Failed to parse PDF: {str(e)}") + raise ValueError("Unsupported file type") @@ -238,8 +255,9 @@ async def upload_knowledge_documents( content = await file.read() if not content: continue - if len(content) > 5 * 1024 * 1024: - raise HTTPException(status_code=400, detail=f"文件过大: {filename}") + # 将大小限制从 5MB 放宽到 15MB,以更好地支持带有图片的 PDF 文件 + if len(content) > 15 * 1024 * 1024: + raise HTTPException(status_code=400, detail=f"文件过大 (超过 15MB): {filename}") try: text = _extract_upload_text(filename, content) except Exception: diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 1be3119..8529ade 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -31,6 +31,7 @@ dependencies = [ "psycopg2-binary>=2.9.11", "pydantic>=2.12.0,<3.0.0", "pydantic-settings>=2.12.0,<3.0.0", + "pypdf2>=3.0.0", "python-dotenv>=1.0.1", "python-jose[cryptography]>=3.5.0", "python-multipart>=0.0.22", diff --git a/backend/uv.lock b/backend/uv.lock index f54b250..e5962be 100644 --- a/backend/uv.lock +++ b/backend/uv.lock @@ -245,6 +245,7 @@ dependencies = [ { name = "psycopg2-binary" }, { name = "pydantic" }, { name = "pydantic-settings" }, + { name = "pypdf2" }, { name = "python-dotenv" }, { name = "python-jose", extra = ["cryptography"] }, { name = "python-multipart" }, @@ -293,6 +294,7 @@ requires-dist = [ { name = "psycopg2-binary", specifier = ">=2.9.11" }, { name = "pydantic", specifier = ">=2.12.0,<3.0.0" }, { name = "pydantic-settings", specifier = ">=2.12.0,<3.0.0" }, + { name = "pypdf2", specifier = ">=3.0.0" }, { name = "python-dotenv", specifier = ">=1.0.1" }, { name = "python-jose", extras = ["cryptography"], specifier = ">=3.5.0" }, { name = "python-multipart", specifier = ">=0.0.22" }, @@ -2660,6 +2662,15 @@ crypto = [ { name = "cryptography" }, ] +[[package]] +name = "pypdf2" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0"