fix: kb add pdf support
This commit is contained in:
@@ -50,6 +50,23 @@ def _extract_upload_text(filename: str, content: bytes) -> str:
|
|||||||
if lower.endswith((".xls", ".xlsx")):
|
if lower.endswith((".xls", ".xlsx")):
|
||||||
df = pd.read_excel(io.BytesIO(content))
|
df = pd.read_excel(io.BytesIO(content))
|
||||||
return df.to_csv(index=False)
|
return df.to_csv(index=False)
|
||||||
|
|
||||||
|
# 增加对 PDF 的文本提取支持
|
||||||
|
if lower.endswith(".pdf"):
|
||||||
|
try:
|
||||||
|
import PyPDF2
|
||||||
|
pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
|
||||||
|
text = []
|
||||||
|
for page in pdf_reader.pages:
|
||||||
|
page_text = page.extract_text()
|
||||||
|
if page_text:
|
||||||
|
text.append(page_text)
|
||||||
|
return "\n".join(text)
|
||||||
|
except ImportError:
|
||||||
|
raise ValueError("PyPDF2 is not installed. Cannot parse PDF files.")
|
||||||
|
except Exception as e:
|
||||||
|
raise ValueError(f"Failed to parse PDF: {str(e)}")
|
||||||
|
|
||||||
raise ValueError("Unsupported file type")
|
raise ValueError("Unsupported file type")
|
||||||
|
|
||||||
|
|
||||||
@@ -238,8 +255,9 @@ async def upload_knowledge_documents(
|
|||||||
content = await file.read()
|
content = await file.read()
|
||||||
if not content:
|
if not content:
|
||||||
continue
|
continue
|
||||||
if len(content) > 5 * 1024 * 1024:
|
# 将大小限制从 5MB 放宽到 15MB,以更好地支持带有图片的 PDF 文件
|
||||||
raise HTTPException(status_code=400, detail=f"文件过大: {filename}")
|
if len(content) > 15 * 1024 * 1024:
|
||||||
|
raise HTTPException(status_code=400, detail=f"文件过大 (超过 15MB): {filename}")
|
||||||
try:
|
try:
|
||||||
text = _extract_upload_text(filename, content)
|
text = _extract_upload_text(filename, content)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ dependencies = [
|
|||||||
"psycopg2-binary>=2.9.11",
|
"psycopg2-binary>=2.9.11",
|
||||||
"pydantic>=2.12.0,<3.0.0",
|
"pydantic>=2.12.0,<3.0.0",
|
||||||
"pydantic-settings>=2.12.0,<3.0.0",
|
"pydantic-settings>=2.12.0,<3.0.0",
|
||||||
|
"pypdf2>=3.0.0",
|
||||||
"python-dotenv>=1.0.1",
|
"python-dotenv>=1.0.1",
|
||||||
"python-jose[cryptography]>=3.5.0",
|
"python-jose[cryptography]>=3.5.0",
|
||||||
"python-multipart>=0.0.22",
|
"python-multipart>=0.0.22",
|
||||||
|
|||||||
Generated
+11
@@ -245,6 +245,7 @@ dependencies = [
|
|||||||
{ name = "psycopg2-binary" },
|
{ name = "psycopg2-binary" },
|
||||||
{ name = "pydantic" },
|
{ name = "pydantic" },
|
||||||
{ name = "pydantic-settings" },
|
{ name = "pydantic-settings" },
|
||||||
|
{ name = "pypdf2" },
|
||||||
{ name = "python-dotenv" },
|
{ name = "python-dotenv" },
|
||||||
{ name = "python-jose", extra = ["cryptography"] },
|
{ name = "python-jose", extra = ["cryptography"] },
|
||||||
{ name = "python-multipart" },
|
{ name = "python-multipart" },
|
||||||
@@ -293,6 +294,7 @@ requires-dist = [
|
|||||||
{ name = "psycopg2-binary", specifier = ">=2.9.11" },
|
{ name = "psycopg2-binary", specifier = ">=2.9.11" },
|
||||||
{ name = "pydantic", specifier = ">=2.12.0,<3.0.0" },
|
{ name = "pydantic", specifier = ">=2.12.0,<3.0.0" },
|
||||||
{ name = "pydantic-settings", specifier = ">=2.12.0,<3.0.0" },
|
{ name = "pydantic-settings", specifier = ">=2.12.0,<3.0.0" },
|
||||||
|
{ name = "pypdf2", specifier = ">=3.0.0" },
|
||||||
{ name = "python-dotenv", specifier = ">=1.0.1" },
|
{ name = "python-dotenv", specifier = ">=1.0.1" },
|
||||||
{ name = "python-jose", extras = ["cryptography"], specifier = ">=3.5.0" },
|
{ name = "python-jose", extras = ["cryptography"], specifier = ">=3.5.0" },
|
||||||
{ name = "python-multipart", specifier = ">=0.0.22" },
|
{ name = "python-multipart", specifier = ">=0.0.22" },
|
||||||
@@ -2660,6 +2662,15 @@ crypto = [
|
|||||||
{ name = "cryptography" },
|
{ name = "cryptography" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pypdf2"
|
||||||
|
version = "3.0.1"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "python-dateutil"
|
name = "python-dateutil"
|
||||||
version = "2.9.0.post0"
|
version = "2.9.0.post0"
|
||||||
|
|||||||
Reference in New Issue
Block a user