diff --git a/.gitignore b/.gitignore index c0ff608..9d87cce 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +data _research .trae .DS_Store diff --git a/README.md b/README.md index 2dca8e4..c1d08c6 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ DataClaw 的架构主要分为三只“大钳子”: 1. **`frontend/`** 🎨: 闪亮的外壳。基于 **React 19**、**Vite**、**TailwindCSS** 和 **Zustand** 构建。拥有类似微信/ChatGPT的对话界面、支持流式思考过程渲染以及交互式图表展示。 2. **`backend/`** ⚙️: 强健的肌肉。一个 **FastAPI** 后端服务,负责管理项目、数据源连接、用户会话持久化以及作为 API 网关。 3. **`nanobot/`** 🧠: 智慧的大脑。核心的 AI Agent 框架,负责处理意图路由、NL2SQL 转换、Schema 缓存管理以及与 LLM 的底层交互。 +4. **`data/`** 🗄️: 运行时数据目录。与代码目录解耦,存放上传文件、会话、技能工作区、报告与配置缓存。 *** @@ -68,6 +69,14 @@ pip install -r requirements.txt uvicorn app.main:app --reload --port 8000 ``` +可选环境变量: + +```bash +export DATA_ROOT=/absolute/path/to/data +``` + +若未设置,默认使用仓库根目录下的 `data/`。 + *提示:请确保* *`nanobot`* *核心库已根据项目工作区的要求正确链接或以可编辑模式 (editable mode) 安装。* ### 2. 前端服务启动 ⚛️ @@ -150,4 +159,3 @@ DataClaw 的开发深受以下优秀开源项目的启发,特此致谢: - [Aix-DB](https://github.com/apconw/Aix-DB): 在智能数据分析和交互式体验方面提供了极好的参考。
- diff --git a/README_en.md b/README_en.md index a457153..27abea5 100644 --- a/README_en.md +++ b/README_en.md @@ -44,6 +44,7 @@ DataClaw is divided into three main claws (components): 1. **`frontend/`** 🎨: The shiny shell. Built with **React 19**, **Vite**, **TailwindCSS**, and **Zustand**. It features a chat-like interface, streaming AI responses, and interactive Vega charts. 2. **`backend/`** ⚙️: The muscle. A **FastAPI** application managing projects, data source connections, user sessions, and API gateways. 3. **`nanobot/`** 🧠: The brain. The core AI agent framework handling NL2SQL, schema caching, prompt injection, and LLM routing. +4. **`data/`** 🗄️: Runtime data root. Decoupled from code directories and used for uploads, sessions, workspace skills, reports, and cached configs. *** @@ -68,6 +69,14 @@ pip install -r requirements.txt uvicorn app.main:app --reload --port 8000 ``` +Optional environment variable: + +```bash +export DATA_ROOT=/absolute/path/to/data +``` + +If not set, DataClaw uses the repository-level `data/` directory by default. + *Note: Ensure your* *`nanobot`* *is properly linked or installed in editable mode as per the project workspace.* ### 2. Frontend Setup ⚛️ @@ -149,4 +158,3 @@ The development of DataClaw was deeply inspired by the following excellent open- - [Aix-DB](https://github.com/apconw/Aix-DB): Provided an excellent reference for intelligent data analysis and interactive user experience.
- diff --git a/backend/app/api/llm.py b/backend/app/api/llm.py index 039f8a4..f7ca43b 100644 --- a/backend/app/api/llm.py +++ b/backend/app/api/llm.py @@ -6,12 +6,13 @@ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer from jose import jwt, JWTError from pydantic import BaseModel, Field from app.core.security import SECRET_KEY, ALGORITHM +from app.core.data_root import get_data_root from litellm import completion router = APIRouter() security = HTTPBearer() -DATA_FILE = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "llm_config.json") +DATA_FILE = str(get_data_root() / "llm_config.json") class CurrentUser(BaseModel): diff --git a/backend/app/api/skills.py b/backend/app/api/skills.py index 1cc8bed..aecc452 100644 --- a/backend/app/api/skills.py +++ b/backend/app/api/skills.py @@ -10,14 +10,15 @@ from datetime import datetime from fastapi import APIRouter, HTTPException, UploadFile, File, Form from pydantic import BaseModel, Field +from app.core.data_root import get_data_root, get_workspace_root + router = APIRouter() -BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) -DATA_FILE = os.path.join(BASE_DIR, "data", "skills.json") -SKILL_HUB_DIR = os.path.join(BASE_DIR, "data", "workspace", "skills") +DATA_FILE = str(get_data_root() / "skills.json") +SKILL_HUB_DIR = str(get_workspace_root() / "skills") -# Ensure skill-hub directory exists -os.makedirs(SKILL_HUB_DIR, exist_ok=True) +def _ensure_skill_hub_dir() -> None: + os.makedirs(SKILL_HUB_DIR, exist_ok=True) class Skill(BaseModel): id: str = Field(..., description="Unique identifier for the skill") @@ -134,6 +135,7 @@ def _write_skill_markdown(skill_dir: str, skill_name: str, description: Optional return skill_md_path def load_skills(project_id: Optional[int] = None) -> List[Dict[str, Any]]: + _ensure_skill_hub_dir() data = _load_data() registered_paths = set() @@ -208,6 +210,7 @@ async def upload_skill( """Upload a skill file (SKILL.md) or a packaged skill (zip/tar.gz).""" filename = file.filename print(f"Uploading skill: {filename}, project_id: {project_id}") + _ensure_skill_hub_dir() # Create a unique temp directory temp_dir_name = f"temp_{datetime.now().timestamp()}_{os.urandom(4).hex()}" @@ -323,6 +326,7 @@ async def upload_skill( @router.post("/skills", response_model=Skill) def create_skill(skill: SkillCreate): + _ensure_skill_hub_dir() data = load_skills() if any(item["id"] == skill.id for item in data): raise HTTPException(status_code=400, detail="Skill with this ID already exists") diff --git a/backend/app/api/upload.py b/backend/app/api/upload.py index 1f4a1ed..94af496 100644 --- a/backend/app/api/upload.py +++ b/backend/app/api/upload.py @@ -3,14 +3,15 @@ import pandas as pd import duckdb import io import uuid -from pathlib import Path + +from app.core.data_root import get_uploads_root router = APIRouter() -upload_dir = Path(__file__).resolve().parents[2] / "data" / "uploads" -upload_dir.mkdir(parents=True, exist_ok=True) +upload_dir = get_uploads_root() @router.post("/upload/file") async def upload_file(file: UploadFile = File(...)): + upload_dir.mkdir(parents=True, exist_ok=True) allowed_extensions = ('.csv', '.xls', '.xlsx', '.parquet', '.db', '.sqlite', '.sqlite3') filename_lower = file.filename.lower() if not filename_lower.endswith(allowed_extensions): diff --git a/backend/app/core/artifacts.py b/backend/app/core/artifacts.py index fc071fb..6829baa 100644 --- a/backend/app/core/artifacts.py +++ b/backend/app/core/artifacts.py @@ -6,6 +6,7 @@ from urllib.parse import quote from pydantic import BaseModel +from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root LOCAL_URI_PATTERN = re.compile(r"local://[^\s<>'\"\]\)\}]+") PATH_PATTERN = re.compile( @@ -138,11 +139,11 @@ def _normalize_locator(raw_locator: str) -> str: def _resolve_locator(locator: str) -> Path | None: - backend_root = Path(__file__).resolve().parents[2] - data_root = backend_root / "data" - workspace_root = data_root / "workspace" - uploads_root = data_root / "uploads" - reports_root = data_root / "data" + data_root = get_data_root() + workspace_root = get_workspace_root() + uploads_root = get_uploads_root() + reports_root = get_reports_root() + repo_root = data_root.parent if locator.startswith("local://"): raw_local = locator.replace("local://", "", 1).strip().lstrip("/\\") if not raw_local: @@ -160,11 +161,11 @@ def _resolve_locator(locator: str) -> Path | None: if path.is_absolute(): return path if normalized.startswith("data/data/"): - return backend_root / normalized + return repo_root / normalized checks = [ workspace_root / normalized, data_root / normalized, - backend_root / normalized, + repo_root / normalized, ] for candidate in checks: if candidate.exists(): diff --git a/backend/app/core/data_root.py b/backend/app/core/data_root.py new file mode 100644 index 0000000..ffa8957 --- /dev/null +++ b/backend/app/core/data_root.py @@ -0,0 +1,39 @@ +import os +from pathlib import Path + + +BACKEND_ROOT = Path(__file__).resolve().parents[2] +REPO_ROOT = BACKEND_ROOT.parent +DEFAULT_DATA_ROOT = REPO_ROOT / "data" +LEGACY_DATA_ROOT = BACKEND_ROOT / "data" + + +def get_data_root() -> Path: + configured = (os.getenv("DATA_ROOT") or "").strip() + if configured: + return Path(configured).expanduser().resolve() + if DEFAULT_DATA_ROOT.exists(): + return DEFAULT_DATA_ROOT + if LEGACY_DATA_ROOT.exists(): + print(f"[DATA_ROOT] legacy path detected: {LEGACY_DATA_ROOT}. Please migrate to {DEFAULT_DATA_ROOT}.") + return LEGACY_DATA_ROOT + return DEFAULT_DATA_ROOT + + +def get_workspace_root() -> Path: + return get_data_root() / "workspace" + + +def get_uploads_root() -> Path: + return get_data_root() / "uploads" + + +def get_reports_root() -> Path: + return get_data_root() / "data" + + +def ensure_data_layout() -> None: + get_data_root().mkdir(parents=True, exist_ok=True) + get_workspace_root().mkdir(parents=True, exist_ok=True) + get_uploads_root().mkdir(parents=True, exist_ok=True) + get_reports_root().mkdir(parents=True, exist_ok=True) diff --git a/backend/app/core/files.py b/backend/app/core/files.py index e228f29..3482d44 100644 --- a/backend/app/core/files.py +++ b/backend/app/core/files.py @@ -2,12 +2,13 @@ import os from pathlib import Path from typing import Optional +from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root -backend_root = Path(__file__).resolve().parents[2] -data_root = backend_root / "data" -workspace_root = data_root / "workspace" -uploads_root = data_root / "uploads" -reports_root = data_root / "data" + +data_root = get_data_root() +workspace_root = get_workspace_root() +uploads_root = get_uploads_root() +reports_root = get_reports_root() allowed_artifact_roots = (workspace_root, uploads_root, reports_root) @@ -50,7 +51,7 @@ def resolve_artifact_target(target: str) -> Path | None: if path.is_absolute(): return path if normalized.startswith("data/data/"): - return backend_root / normalized + return data_root.parent / normalized checks = ( workspace_root / normalized, data_root / normalized, diff --git a/backend/app/core/nanobot.py b/backend/app/core/nanobot.py index 04d4338..caa140b 100644 --- a/backend/app/core/nanobot.py +++ b/backend/app/core/nanobot.py @@ -34,6 +34,7 @@ from nanobot.config.schema import Config from app.api.skills import load_skills from app.services.llm_cache import get_llm_configs +from app.core.data_root import get_workspace_root from app.core.streaming_provider import StreamingLiteLLMProvider class NanobotIntegration: @@ -47,8 +48,7 @@ class NanobotIntegration: self._model_agent_lock = asyncio.Lock() def initialize(self): - # Set workspace path to backend/data/workspace - workspace_path = Path(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "workspace")) + workspace_path = get_workspace_root() workspace_path.mkdir(parents=True, exist_ok=True) self._sync_builtin_skills_to_workspace(workspace_path) diff --git a/backend/app/core/session_alias_store.py b/backend/app/core/session_alias_store.py index ea6b24f..42c5ef6 100644 --- a/backend/app/core/session_alias_store.py +++ b/backend/app/core/session_alias_store.py @@ -5,14 +5,21 @@ from datetime import datetime, timezone from pathlib import Path from typing import Any +from app.core.data_root import get_data_root + class SessionAliasStore: def __init__(self) -> None: - backend_root = Path(__file__).resolve().parents[2] - data_dir = backend_root / "data" - data_dir.mkdir(parents=True, exist_ok=True) + data_dir = get_data_root() + try: + data_dir.mkdir(parents=True, exist_ok=True) + except PermissionError as exc: + raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc self.db_path = data_dir / "nanobot_sessions.db" - self._init_db() + try: + self._init_db() + except PermissionError as exc: + raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc def _connect(self) -> sqlite3.Connection: conn = sqlite3.connect(str(self.db_path)) diff --git a/backend/app/services/mdl.py b/backend/app/services/mdl.py index 8151560..29b503b 100644 --- a/backend/app/services/mdl.py +++ b/backend/app/services/mdl.py @@ -6,9 +6,9 @@ from app.models.datasource import DataSource from app.schemas.mdl import MDLManifest, Model, Column, TableReference from app.connectors.factory import get_connector from app.database import SessionLocal +from app.core.data_root import get_data_root -# Assuming running from backend/ directory -MDL_STORAGE_PATH = Path("data/mdl") +MDL_STORAGE_PATH = get_data_root() / "mdl" class MDLService: @staticmethod diff --git a/backend/main.py b/backend/main.py index 3112485..9eda992 100644 --- a/backend/main.py +++ b/backend/main.py @@ -20,6 +20,7 @@ from app.api import upload, llm, skills, users, datasources, projects, semantic from app.connectors.postgres import postgres_connector from app.connectors.clickhouse import clickhouse_connector from app.core.artifacts import extract_artifacts +from app.core.data_root import ensure_data_layout, get_data_root, get_reports_root from app.core.files import ensure_artifact_access, resolve_artifact_target from app.core.nanobot import nanobot_service from app.core.session_alias_store import session_alias_store @@ -44,9 +45,12 @@ app.add_middleware( Base.metadata.create_all(bind=engine) # Mount static directory for reports -data_dir = os.path.join(os.path.dirname(__file__), "data", "data") -os.makedirs(data_dir, exist_ok=True) -app.mount("/reports", StaticFiles(directory=data_dir), name="reports") +try: + ensure_data_layout() +except Exception as e: + raise RuntimeError(f"DATA_ROOT 初始化失败: {e}") from e +reports_dir = get_reports_root() +app.mount("/reports", StaticFiles(directory=str(reports_dir)), name="reports") app.include_router(upload.router, prefix="/api/v1") app.include_router(llm.router, prefix="/api/v1") @@ -71,6 +75,13 @@ PREVIEWABLE_TEXT_EXTENSIONS = { @app.on_event("startup") async def startup_event(): + try: + data_root = get_data_root() + data_root.mkdir(parents=True, exist_ok=True) + if not os.access(data_root, os.R_OK | os.W_OK | os.X_OK): + raise RuntimeError(f"DATA_ROOT 权限不足: {data_root}") + except Exception as e: + raise RuntimeError(f"DATA_ROOT 初始化失败: {e}") from e # Initialize nanobot in background try: await nanobot_service.start() diff --git a/backend/tests/test_artifact_endpoints.py b/backend/tests/test_artifact_endpoints.py index a35c970..862f42a 100644 --- a/backend/tests/test_artifact_endpoints.py +++ b/backend/tests/test_artifact_endpoints.py @@ -3,11 +3,12 @@ from pathlib import Path import pytest from fastapi.testclient import TestClient +from app.core.data_root import get_data_root from main import app def _backend_data_root() -> Path: - return Path(__file__).resolve().parents[1] / "data" + return get_data_root() def test_download_artifact_within_whitelist() -> None: diff --git a/backend/tests/test_artifacts.py b/backend/tests/test_artifacts.py index 29e792e..4a3519c 100644 --- a/backend/tests/test_artifacts.py +++ b/backend/tests/test_artifacts.py @@ -1,10 +1,11 @@ from pathlib import Path from app.core.artifacts import extract_artifacts +from app.core.data_root import get_data_root def _backend_data_root() -> Path: - return Path(__file__).resolve().parents[1] / "data" + return get_data_root() def test_extract_artifacts_from_local_and_tool_paths() -> None: diff --git a/backend/tests/test_data_root.py b/backend/tests/test_data_root.py new file mode 100644 index 0000000..038fa40 --- /dev/null +++ b/backend/tests/test_data_root.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from app.core import data_root + + +def test_data_root_prefers_env(monkeypatch, tmp_path: Path) -> None: + custom = tmp_path / "custom-data-root" + monkeypatch.setenv("DATA_ROOT", str(custom)) + assert data_root.get_data_root() == custom.resolve() + + +def test_data_root_falls_back_to_legacy(monkeypatch, tmp_path: Path) -> None: + monkeypatch.delenv("DATA_ROOT", raising=False) + legacy = tmp_path / "legacy-data" + default = tmp_path / "default-data" + legacy.mkdir(parents=True, exist_ok=True) + monkeypatch.setattr(data_root, "LEGACY_DATA_ROOT", legacy) + monkeypatch.setattr(data_root, "DEFAULT_DATA_ROOT", default) + assert data_root.get_data_root() == legacy + + +def test_ensure_data_layout_creates_children(monkeypatch, tmp_path: Path) -> None: + monkeypatch.setenv("DATA_ROOT", str(tmp_path / "dr")) + data_root.ensure_data_layout() + root = data_root.get_data_root() + assert (root / "workspace").exists() + assert (root / "uploads").exists() + assert (root / "data").exists() diff --git a/backend/test_nl2sql.py b/backend/tests/test_nl2sql.py similarity index 100% rename from backend/test_nl2sql.py rename to backend/tests/test_nl2sql.py diff --git a/frontend/src/components/ChatInterface.tsx b/frontend/src/components/ChatInterface.tsx index f5e409a..0c53bea 100644 --- a/frontend/src/components/ChatInterface.tsx +++ b/frontend/src/components/ChatInterface.tsx @@ -1114,8 +1114,20 @@ export function ChatInterface() {