diff --git a/.gitignore b/.gitignore
index c0ff608..9d87cce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+data
_research
.trae
.DS_Store
diff --git a/README.md b/README.md
index 2dca8e4..c1d08c6 100644
--- a/README.md
+++ b/README.md
@@ -44,6 +44,7 @@ DataClaw 的架构主要分为三只“大钳子”:
1. **`frontend/`** 🎨: 闪亮的外壳。基于 **React 19**、**Vite**、**TailwindCSS** 和 **Zustand** 构建。拥有类似微信/ChatGPT的对话界面、支持流式思考过程渲染以及交互式图表展示。
2. **`backend/`** ⚙️: 强健的肌肉。一个 **FastAPI** 后端服务,负责管理项目、数据源连接、用户会话持久化以及作为 API 网关。
3. **`nanobot/`** 🧠: 智慧的大脑。核心的 AI Agent 框架,负责处理意图路由、NL2SQL 转换、Schema 缓存管理以及与 LLM 的底层交互。
+4. **`data/`** 🗄️: 运行时数据目录。与代码目录解耦,存放上传文件、会话、技能工作区、报告与配置缓存。
***
@@ -68,6 +69,14 @@ pip install -r requirements.txt
uvicorn app.main:app --reload --port 8000
```
+可选环境变量:
+
+```bash
+export DATA_ROOT=/absolute/path/to/data
+```
+
+若未设置,默认使用仓库根目录下的 `data/`;若该目录不存在而旧的 `backend/data/` 存在,则回退使用旧目录并提示迁移。
+
*提示:请确保* *`nanobot`* *核心库已根据项目工作区的要求正确链接或以可编辑模式 (editable mode) 安装。*
### 2. 前端服务启动 ⚛️
@@ -150,4 +159,3 @@ DataClaw 的开发深受以下优秀开源项目的启发,特此致谢:
- [Aix-DB](https://github.com/apconw/Aix-DB): 在智能数据分析和交互式体验方面提供了极好的参考。
-
diff --git a/README_en.md b/README_en.md
index a457153..27abea5 100644
--- a/README_en.md
+++ b/README_en.md
@@ -44,6 +44,7 @@ DataClaw is divided into three main claws (components):
1. **`frontend/`** 🎨: The shiny shell. Built with **React 19**, **Vite**, **TailwindCSS**, and **Zustand**. It features a chat-like interface, streaming AI responses, and interactive Vega charts.
2. **`backend/`** ⚙️: The muscle. A **FastAPI** application managing projects, data source connections, user sessions, and API gateways.
3. **`nanobot/`** 🧠: The brain. The core AI agent framework handling NL2SQL, schema caching, prompt injection, and LLM routing.
+4. **`data/`** 🗄️: Runtime data root. Decoupled from code directories and used for uploads, sessions, workspace skills, reports, and cached configs.
***
@@ -68,6 +69,14 @@ pip install -r requirements.txt
uvicorn app.main:app --reload --port 8000
```
+Optional environment variable:
+
+```bash
+export DATA_ROOT=/absolute/path/to/data
+```
+
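+If not set, DataClaw uses the repository-level `data/` directory by default; if that directory does not exist but the legacy `backend/data/` does, the legacy directory is used and a migration notice is printed.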
+
*Note: Ensure your* *`nanobot`* *is properly linked or installed in editable mode as per the project workspace.*
### 2. Frontend Setup ⚛️
@@ -149,4 +158,3 @@ The development of DataClaw was deeply inspired by the following excellent open-
- [Aix-DB](https://github.com/apconw/Aix-DB): Provided an excellent reference for intelligent data analysis and interactive user experience.
-
diff --git a/backend/app/api/llm.py b/backend/app/api/llm.py
index 039f8a4..f7ca43b 100644
--- a/backend/app/api/llm.py
+++ b/backend/app/api/llm.py
@@ -6,12 +6,13 @@ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from jose import jwt, JWTError
from pydantic import BaseModel, Field
from app.core.security import SECRET_KEY, ALGORITHM
+from app.core.data_root import get_data_root
from litellm import completion
router = APIRouter()
security = HTTPBearer()
-DATA_FILE = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "llm_config.json")
+DATA_FILE = str(get_data_root() / "llm_config.json")  # resolved once at import; kept as a plain string like the os.path.join value it replaces
class CurrentUser(BaseModel):
diff --git a/backend/app/api/skills.py b/backend/app/api/skills.py
index 1cc8bed..aecc452 100644
--- a/backend/app/api/skills.py
+++ b/backend/app/api/skills.py
@@ -10,14 +10,15 @@ from datetime import datetime
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from pydantic import BaseModel, Field
+from app.core.data_root import get_data_root, get_workspace_root
+
router = APIRouter()
-BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
-DATA_FILE = os.path.join(BASE_DIR, "data", "skills.json")
-SKILL_HUB_DIR = os.path.join(BASE_DIR, "data", "workspace", "skills")
+DATA_FILE = str(get_data_root() / "skills.json")  # skill registry file; kept as a plain string like the os.path.join value it replaces
+SKILL_HUB_DIR = str(get_workspace_root() / "skills")  # skills directory under the workspace root; resolved once at import
-# Ensure skill-hub directory exists
-os.makedirs(SKILL_HUB_DIR, exist_ok=True)
+def _ensure_skill_hub_dir() -> None:  # Create the skill hub directory on demand rather than as an import-time side effect.
+ os.makedirs(SKILL_HUB_DIR, exist_ok=True)  # exist_ok=True: no error when the directory is already there
class Skill(BaseModel):
id: str = Field(..., description="Unique identifier for the skill")
@@ -134,6 +135,7 @@ def _write_skill_markdown(skill_dir: str, skill_name: str, description: Optional
return skill_md_path
def load_skills(project_id: Optional[int] = None) -> List[Dict[str, Any]]:
+ _ensure_skill_hub_dir()
data = _load_data()
registered_paths = set()
@@ -208,6 +210,7 @@ async def upload_skill(
"""Upload a skill file (SKILL.md) or a packaged skill (zip/tar.gz)."""
filename = file.filename
print(f"Uploading skill: {filename}, project_id: {project_id}")
+ _ensure_skill_hub_dir()
# Create a unique temp directory
temp_dir_name = f"temp_{datetime.now().timestamp()}_{os.urandom(4).hex()}"
@@ -323,6 +326,7 @@ async def upload_skill(
@router.post("/skills", response_model=Skill)
def create_skill(skill: SkillCreate):
+ _ensure_skill_hub_dir()
data = load_skills()
if any(item["id"] == skill.id for item in data):
raise HTTPException(status_code=400, detail="Skill with this ID already exists")
diff --git a/backend/app/api/upload.py b/backend/app/api/upload.py
index 1f4a1ed..94af496 100644
--- a/backend/app/api/upload.py
+++ b/backend/app/api/upload.py
@@ -3,14 +3,15 @@ import pandas as pd
import duckdb
import io
import uuid
-from pathlib import Path
+
+from app.core.data_root import get_uploads_root
router = APIRouter()
-upload_dir = Path(__file__).resolve().parents[2] / "data" / "uploads"
-upload_dir.mkdir(parents=True, exist_ok=True)
+upload_dir = get_uploads_root()  # path only; the directory itself is created inside the upload handler, not at import
@router.post("/upload/file")
async def upload_file(file: UploadFile = File(...)):
+ upload_dir.mkdir(parents=True, exist_ok=True)
allowed_extensions = ('.csv', '.xls', '.xlsx', '.parquet', '.db', '.sqlite', '.sqlite3')
filename_lower = file.filename.lower()
if not filename_lower.endswith(allowed_extensions):
diff --git a/backend/app/core/artifacts.py b/backend/app/core/artifacts.py
index fc071fb..6829baa 100644
--- a/backend/app/core/artifacts.py
+++ b/backend/app/core/artifacts.py
@@ -6,6 +6,7 @@ from urllib.parse import quote
from pydantic import BaseModel
+from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root
LOCAL_URI_PATTERN = re.compile(r"local://[^\s<>'\"\]\)\}]+")
PATH_PATTERN = re.compile(
@@ -138,11 +139,11 @@ def _normalize_locator(raw_locator: str) -> str:
def _resolve_locator(locator: str) -> Path | None:
- backend_root = Path(__file__).resolve().parents[2]
- data_root = backend_root / "data"
- workspace_root = data_root / "workspace"
- uploads_root = data_root / "uploads"
- reports_root = data_root / "data"
+ data_root = get_data_root()
+ workspace_root = get_workspace_root()
+ uploads_root = get_uploads_root()
+ reports_root = get_reports_root()
+ repo_root = data_root.parent
if locator.startswith("local://"):
raw_local = locator.replace("local://", "", 1).strip().lstrip("/\\")
if not raw_local:
@@ -160,11 +161,11 @@ def _resolve_locator(locator: str) -> Path | None:
if path.is_absolute():
return path
if normalized.startswith("data/data/"):
- return backend_root / normalized
+ return repo_root / normalized
checks = [
workspace_root / normalized,
data_root / normalized,
- backend_root / normalized,
+ repo_root / normalized,
]
for candidate in checks:
if candidate.exists():
diff --git a/backend/app/core/data_root.py b/backend/app/core/data_root.py
new file mode 100644
index 0000000..ffa8957
--- /dev/null
+++ b/backend/app/core/data_root.py
@@ -0,0 +1,39 @@
+import os
+from pathlib import Path
+
+
+BACKEND_ROOT = Path(__file__).resolve().parents[2]
+REPO_ROOT = BACKEND_ROOT.parent
+DEFAULT_DATA_ROOT = REPO_ROOT / "data"
+LEGACY_DATA_ROOT = BACKEND_ROOT / "data"
+
+
+def get_data_root() -> Path:  # Resolve the runtime data directory: DATA_ROOT env var, then repo-level data/, then legacy backend/data.
+ configured = (os.getenv("DATA_ROOT") or "").strip()  # unset and whitespace-only values both count as "not configured"
+ if configured:
+ return Path(configured).expanduser().resolve()  # explicit override wins; ~ is expanded and the path is made absolute, with no existence check
+ if DEFAULT_DATA_ROOT.exists():
+ return DEFAULT_DATA_ROOT
+ if LEGACY_DATA_ROOT.exists():  # reached only when the default directory does not exist
+ print(f"[DATA_ROOT] legacy path detected: {LEGACY_DATA_ROOT}. Please migrate to {DEFAULT_DATA_ROOT}.")  # NOTE(review): emitted on every call that lands here, not once per process
+ return LEGACY_DATA_ROOT
+ return DEFAULT_DATA_ROOT  # neither directory exists: return the default path without creating it
+
+
+def get_workspace_root() -> Path:  # Workspace directory: <data root>/workspace.
+ return get_data_root() / "workspace"  # recomputed on each call, so it follows the current DATA_ROOT value
+
+
+def get_uploads_root() -> Path:  # Uploads directory: <data root>/uploads.
+ return get_data_root() / "uploads"  # path only; nothing is created here
+
+
+def get_reports_root() -> Path:  # Reports directory: <data root>/data.
+ return get_data_root() / "data"  # the nested "data" name is intentional; it is not a second data root
+
+
+def ensure_data_layout() -> None:  # Create the data root and its workspace/uploads/reports subdirectories if they are missing.
+ get_data_root().mkdir(parents=True, exist_ok=True)  # parents=True also creates missing ancestors of a custom DATA_ROOT
+ get_workspace_root().mkdir(parents=True, exist_ok=True)
+ get_uploads_root().mkdir(parents=True, exist_ok=True)
+ get_reports_root().mkdir(parents=True, exist_ok=True)  # exist_ok=True throughout: existing directories do not raise
diff --git a/backend/app/core/files.py b/backend/app/core/files.py
index e228f29..3482d44 100644
--- a/backend/app/core/files.py
+++ b/backend/app/core/files.py
@@ -2,12 +2,13 @@ import os
from pathlib import Path
from typing import Optional
+from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root
-backend_root = Path(__file__).resolve().parents[2]
-data_root = backend_root / "data"
-workspace_root = data_root / "workspace"
-uploads_root = data_root / "uploads"
-reports_root = data_root / "data"
+
+data_root = get_data_root()  # NOTE: these four roots are evaluated once at import; a later change to DATA_ROOT is not picked up here
+workspace_root = get_workspace_root()
+uploads_root = get_uploads_root()
+reports_root = get_reports_root()  # <data root>/data
allowed_artifact_roots = (workspace_root, uploads_root, reports_root)
@@ -50,7 +51,7 @@ def resolve_artifact_target(target: str) -> Path | None:
if path.is_absolute():
return path
if normalized.startswith("data/data/"):
- return backend_root / normalized
+ return data_root.parent / normalized
checks = (
workspace_root / normalized,
data_root / normalized,
diff --git a/backend/app/core/nanobot.py b/backend/app/core/nanobot.py
index 04d4338..caa140b 100644
--- a/backend/app/core/nanobot.py
+++ b/backend/app/core/nanobot.py
@@ -34,6 +34,7 @@ from nanobot.config.schema import Config
from app.api.skills import load_skills
from app.services.llm_cache import get_llm_configs
+from app.core.data_root import get_workspace_root
from app.core.streaming_provider import StreamingLiteLLMProvider
class NanobotIntegration:
@@ -47,8 +48,7 @@ class NanobotIntegration:
self._model_agent_lock = asyncio.Lock()
def initialize(self):
- # Set workspace path to backend/data/workspace
- workspace_path = Path(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "workspace"))
+ workspace_path = get_workspace_root()
workspace_path.mkdir(parents=True, exist_ok=True)
self._sync_builtin_skills_to_workspace(workspace_path)
diff --git a/backend/app/core/session_alias_store.py b/backend/app/core/session_alias_store.py
index ea6b24f..42c5ef6 100644
--- a/backend/app/core/session_alias_store.py
+++ b/backend/app/core/session_alias_store.py
@@ -5,14 +5,21 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Any
+from app.core.data_root import get_data_root
+
class SessionAliasStore:
def __init__(self) -> None:
- backend_root = Path(__file__).resolve().parents[2]
- data_dir = backend_root / "data"
- data_dir.mkdir(parents=True, exist_ok=True)
+ data_dir = get_data_root()  # the session database is stored directly under the shared data root
+ try:
+ data_dir.mkdir(parents=True, exist_ok=True)
+ except PermissionError as exc:  # re-raised as RuntimeError so the message names the directory that could not be created
+ raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc
self.db_path = data_dir / "nanobot_sessions.db"
- self._init_db()
+ try:
+ self._init_db()
+ except PermissionError as exc:  # NOTE(review): sqlite3 usually reports an unopenable database as sqlite3.OperationalError, which this clause would not catch — confirm against _init_db
+ raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc
def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(str(self.db_path))
diff --git a/backend/app/services/mdl.py b/backend/app/services/mdl.py
index 8151560..29b503b 100644
--- a/backend/app/services/mdl.py
+++ b/backend/app/services/mdl.py
@@ -6,9 +6,9 @@ from app.models.datasource import DataSource
from app.schemas.mdl import MDLManifest, Model, Column, TableReference
from app.connectors.factory import get_connector
from app.database import SessionLocal
+from app.core.data_root import get_data_root
-# Assuming running from backend/ directory
-MDL_STORAGE_PATH = Path("data/mdl")
+MDL_STORAGE_PATH = get_data_root() / "mdl"  # anchored to the data root instead of a path relative to the process working directory
class MDLService:
@staticmethod
diff --git a/backend/main.py b/backend/main.py
index 3112485..9eda992 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -20,6 +20,7 @@ from app.api import upload, llm, skills, users, datasources, projects, semantic
from app.connectors.postgres import postgres_connector
from app.connectors.clickhouse import clickhouse_connector
from app.core.artifacts import extract_artifacts
+from app.core.data_root import ensure_data_layout, get_data_root, get_reports_root
from app.core.files import ensure_artifact_access, resolve_artifact_target
from app.core.nanobot import nanobot_service
from app.core.session_alias_store import session_alias_store
@@ -44,9 +45,12 @@ app.add_middleware(
Base.metadata.create_all(bind=engine)
# Mount static directory for reports
-data_dir = os.path.join(os.path.dirname(__file__), "data", "data")
-os.makedirs(data_dir, exist_ok=True)
-app.mount("/reports", StaticFiles(directory=data_dir), name="reports")
+try:
+ ensure_data_layout()  # make sure the data root and its subdirectories exist before the reports directory is mounted
+except Exception as e:  # any failure here aborts module import with a DATA_ROOT-specific message
+ raise RuntimeError(f"DATA_ROOT 初始化失败: {e}") from e
+reports_dir = get_reports_root()
+app.mount("/reports", StaticFiles(directory=str(reports_dir)), name="reports")  # expose the reports directory as static files under /reports
app.include_router(upload.router, prefix="/api/v1")
app.include_router(llm.router, prefix="/api/v1")
@@ -71,6 +75,13 @@ PREVIEWABLE_TEXT_EXTENSIONS = {
@app.on_event("startup")
async def startup_event():
+ try:
+ data_root = get_data_root()
+ data_root.mkdir(parents=True, exist_ok=True)
+ if not os.access(data_root, os.R_OK | os.W_OK | os.X_OK):
+ raise RuntimeError(f"DATA_ROOT 权限不足: {data_root}")
+ except Exception as e:
+ raise RuntimeError(f"DATA_ROOT 初始化失败: {e}") from e
# Initialize nanobot in background
try:
await nanobot_service.start()
diff --git a/backend/tests/test_artifact_endpoints.py b/backend/tests/test_artifact_endpoints.py
index a35c970..862f42a 100644
--- a/backend/tests/test_artifact_endpoints.py
+++ b/backend/tests/test_artifact_endpoints.py
@@ -3,11 +3,12 @@ from pathlib import Path
import pytest
from fastapi.testclient import TestClient
+from app.core.data_root import get_data_root
from main import app
def _backend_data_root() -> Path:
- return Path(__file__).resolve().parents[1] / "data"
+ return get_data_root()  # uses the app's own resolution, so despite the helper's name this is no longer necessarily backend/data
def test_download_artifact_within_whitelist() -> None:
diff --git a/backend/tests/test_artifacts.py b/backend/tests/test_artifacts.py
index 29e792e..4a3519c 100644
--- a/backend/tests/test_artifacts.py
+++ b/backend/tests/test_artifacts.py
@@ -1,10 +1,11 @@
from pathlib import Path
from app.core.artifacts import extract_artifacts
+from app.core.data_root import get_data_root
def _backend_data_root() -> Path:
- return Path(__file__).resolve().parents[1] / "data"
+ return get_data_root()  # uses the app's own resolution, so despite the helper's name this is no longer necessarily backend/data
def test_extract_artifacts_from_local_and_tool_paths() -> None:
diff --git a/backend/tests/test_data_root.py b/backend/tests/test_data_root.py
new file mode 100644
index 0000000..038fa40
--- /dev/null
+++ b/backend/tests/test_data_root.py
@@ -0,0 +1,28 @@
+from pathlib import Path
+
+from app.core import data_root
+
+
+def test_data_root_prefers_env(monkeypatch, tmp_path: Path) -> None:  # DATA_ROOT, when set, takes priority over the built-in locations.
+ custom = tmp_path / "custom-data-root"  # deliberately not created: the env override is returned without an existence check
+ monkeypatch.setenv("DATA_ROOT", str(custom))
+ assert data_root.get_data_root() == custom.resolve()  # compare against the resolved form, since get_data_root() resolves the env value
+
+
+def test_data_root_falls_back_to_legacy(monkeypatch, tmp_path: Path) -> None:  # With no env override and no default dir, an existing legacy dir is used.
+ monkeypatch.delenv("DATA_ROOT", raising=False)  # raising=False: fine if the variable was never set
+ legacy = tmp_path / "legacy-data"
+ default = tmp_path / "default-data"  # never created, so the default-exists branch is skipped
+ legacy.mkdir(parents=True, exist_ok=True)
+ monkeypatch.setattr(data_root, "LEGACY_DATA_ROOT", legacy)  # redirect the module constants to temp paths instead of the real repo
+ monkeypatch.setattr(data_root, "DEFAULT_DATA_ROOT", default)
+ assert data_root.get_data_root() == legacy
+
+
+def test_ensure_data_layout_creates_children(monkeypatch, tmp_path: Path) -> None:  # ensure_data_layout() creates the three standard subdirectories.
+ monkeypatch.setenv("DATA_ROOT", str(tmp_path / "dr"))  # point at a not-yet-existing directory so creation is actually exercised
+ data_root.ensure_data_layout()
+ root = data_root.get_data_root()
+ assert (root / "workspace").exists()
+ assert (root / "uploads").exists()
+ assert (root / "data").exists()  # the reports directory
diff --git a/backend/test_nl2sql.py b/backend/tests/test_nl2sql.py
similarity index 100%
rename from backend/test_nl2sql.py
rename to backend/tests/test_nl2sql.py
diff --git a/frontend/src/components/ChatInterface.tsx b/frontend/src/components/ChatInterface.tsx
index f5e409a..0c53bea 100644
--- a/frontend/src/components/ChatInterface.tsx
+++ b/frontend/src/components/ChatInterface.tsx
@@ -1114,8 +1114,20 @@ export function ChatInterface() {