feat: mv data folder to root

This commit is contained in:
qixinbo
2026-03-27 15:59:23 +08:00
parent 37070d7896
commit 5d479bed68
18 changed files with 175 additions and 39 deletions
+1
View File
@@ -1,3 +1,4 @@
data
_research _research
.trae .trae
.DS_Store .DS_Store
+9 -1
View File
@@ -44,6 +44,7 @@ DataClaw 的架构主要分为三只“大钳子”:
1. **`frontend/`** 🎨: 闪亮的外壳。基于 **React 19**、**Vite**、**TailwindCSS** 和 **Zustand** 构建。拥有类似微信/ChatGPT的对话界面、支持流式思考过程渲染以及交互式图表展示。 1. **`frontend/`** 🎨: 闪亮的外壳。基于 **React 19**、**Vite**、**TailwindCSS** 和 **Zustand** 构建。拥有类似微信/ChatGPT的对话界面、支持流式思考过程渲染以及交互式图表展示。
2. **`backend/`** ⚙️: 强健的肌肉。一个 **FastAPI** 后端服务,负责管理项目、数据源连接、用户会话持久化以及作为 API 网关。 2. **`backend/`** ⚙️: 强健的肌肉。一个 **FastAPI** 后端服务,负责管理项目、数据源连接、用户会话持久化以及作为 API 网关。
3. **`nanobot/`** 🧠: 智慧的大脑。核心的 AI Agent 框架,负责处理意图路由、NL2SQL 转换、Schema 缓存管理以及与 LLM 的底层交互。 3. **`nanobot/`** 🧠: 智慧的大脑。核心的 AI Agent 框架,负责处理意图路由、NL2SQL 转换、Schema 缓存管理以及与 LLM 的底层交互。
4. **`data/`** 🗄️: 运行时数据目录。与代码目录解耦,存放上传文件、会话、技能工作区、报告与配置缓存。
*** ***
@@ -68,6 +69,14 @@ pip install -r requirements.txt
uvicorn app.main:app --reload --port 8000 uvicorn app.main:app --reload --port 8000
``` ```
可选环境变量:
```bash
export DATA_ROOT=/absolute/path/to/data
```
若未设置,默认使用仓库根目录下的 `data/` 目录。
*提示:请确保* *`nanobot`* *核心库已根据项目工作区的要求正确链接或以可编辑模式 (editable mode) 安装。* *提示:请确保* *`nanobot`* *核心库已根据项目工作区的要求正确链接或以可编辑模式 (editable mode) 安装。*
### 2. 前端服务启动 ⚛️ ### 2. 前端服务启动 ⚛️
@@ -150,4 +159,3 @@ DataClaw 的开发深受以下优秀开源项目的启发,特此致谢:
- [Aix-DB](https://github.com/apconw/Aix-DB): 在智能数据分析和交互式体验方面提供了极好的参考。 - [Aix-DB](https://github.com/apconw/Aix-DB): 在智能数据分析和交互式体验方面提供了极好的参考。
<br /> <br />
+9 -1
View File
@@ -44,6 +44,7 @@ DataClaw is divided into three main claws (components):
1. **`frontend/`** 🎨: The shiny shell. Built with **React 19**, **Vite**, **TailwindCSS**, and **Zustand**. It features a chat-like interface, streaming AI responses, and interactive Vega charts. 1. **`frontend/`** 🎨: The shiny shell. Built with **React 19**, **Vite**, **TailwindCSS**, and **Zustand**. It features a chat-like interface, streaming AI responses, and interactive Vega charts.
2. **`backend/`** ⚙️: The muscle. A **FastAPI** application managing projects, data source connections, user sessions, and API gateways. 2. **`backend/`** ⚙️: The muscle. A **FastAPI** application managing projects, data source connections, user sessions, and API gateways.
3. **`nanobot/`** 🧠: The brain. The core AI agent framework handling NL2SQL, schema caching, prompt injection, and LLM routing. 3. **`nanobot/`** 🧠: The brain. The core AI agent framework handling NL2SQL, schema caching, prompt injection, and LLM routing.
4. **`data/`** 🗄️: Runtime data root. Decoupled from code directories and used for uploads, sessions, workspace skills, reports, and cached configs.
*** ***
@@ -68,6 +69,14 @@ pip install -r requirements.txt
uvicorn app.main:app --reload --port 8000 uvicorn app.main:app --reload --port 8000
``` ```
Optional environment variable:
```bash
export DATA_ROOT=/absolute/path/to/data
```
If not set, DataClaw uses the repository-level `data/` directory by default.
*Note: Ensure your* *`nanobot`* *is properly linked or installed in editable mode as per the project workspace.* *Note: Ensure your* *`nanobot`* *is properly linked or installed in editable mode as per the project workspace.*
### 2. Frontend Setup ⚛️ ### 2. Frontend Setup ⚛️
@@ -149,4 +158,3 @@ The development of DataClaw was deeply inspired by the following excellent open-
- [Aix-DB](https://github.com/apconw/Aix-DB): Provided an excellent reference for intelligent data analysis and interactive user experience. - [Aix-DB](https://github.com/apconw/Aix-DB): Provided an excellent reference for intelligent data analysis and interactive user experience.
<br /> <br />
+2 -1
View File
@@ -6,12 +6,13 @@ from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from jose import jwt, JWTError from jose import jwt, JWTError
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from app.core.security import SECRET_KEY, ALGORITHM from app.core.security import SECRET_KEY, ALGORITHM
from app.core.data_root import get_data_root
from litellm import completion from litellm import completion
router = APIRouter() router = APIRouter()
security = HTTPBearer() security = HTTPBearer()
DATA_FILE = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "llm_config.json") DATA_FILE = str(get_data_root() / "llm_config.json")
class CurrentUser(BaseModel): class CurrentUser(BaseModel):
+9 -5
View File
@@ -10,14 +10,15 @@ from datetime import datetime
from fastapi import APIRouter, HTTPException, UploadFile, File, Form from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from app.core.data_root import get_data_root, get_workspace_root
router = APIRouter() router = APIRouter()
BASE_DIR = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) DATA_FILE = str(get_data_root() / "skills.json")
DATA_FILE = os.path.join(BASE_DIR, "data", "skills.json") SKILL_HUB_DIR = str(get_workspace_root() / "skills")
SKILL_HUB_DIR = os.path.join(BASE_DIR, "data", "workspace", "skills")
# Ensure skill-hub directory exists def _ensure_skill_hub_dir() -> None:
os.makedirs(SKILL_HUB_DIR, exist_ok=True) os.makedirs(SKILL_HUB_DIR, exist_ok=True)
class Skill(BaseModel): class Skill(BaseModel):
id: str = Field(..., description="Unique identifier for the skill") id: str = Field(..., description="Unique identifier for the skill")
@@ -134,6 +135,7 @@ def _write_skill_markdown(skill_dir: str, skill_name: str, description: Optional
return skill_md_path return skill_md_path
def load_skills(project_id: Optional[int] = None) -> List[Dict[str, Any]]: def load_skills(project_id: Optional[int] = None) -> List[Dict[str, Any]]:
_ensure_skill_hub_dir()
data = _load_data() data = _load_data()
registered_paths = set() registered_paths = set()
@@ -208,6 +210,7 @@ async def upload_skill(
"""Upload a skill file (SKILL.md) or a packaged skill (zip/tar.gz).""" """Upload a skill file (SKILL.md) or a packaged skill (zip/tar.gz)."""
filename = file.filename filename = file.filename
print(f"Uploading skill: {filename}, project_id: {project_id}") print(f"Uploading skill: {filename}, project_id: {project_id}")
_ensure_skill_hub_dir()
# Create a unique temp directory # Create a unique temp directory
temp_dir_name = f"temp_{datetime.now().timestamp()}_{os.urandom(4).hex()}" temp_dir_name = f"temp_{datetime.now().timestamp()}_{os.urandom(4).hex()}"
@@ -323,6 +326,7 @@ async def upload_skill(
@router.post("/skills", response_model=Skill) @router.post("/skills", response_model=Skill)
def create_skill(skill: SkillCreate): def create_skill(skill: SkillCreate):
_ensure_skill_hub_dir()
data = load_skills() data = load_skills()
if any(item["id"] == skill.id for item in data): if any(item["id"] == skill.id for item in data):
raise HTTPException(status_code=400, detail="Skill with this ID already exists") raise HTTPException(status_code=400, detail="Skill with this ID already exists")
+4 -3
View File
@@ -3,14 +3,15 @@ import pandas as pd
import duckdb import duckdb
import io import io
import uuid import uuid
from pathlib import Path
from app.core.data_root import get_uploads_root
router = APIRouter() router = APIRouter()
upload_dir = Path(__file__).resolve().parents[2] / "data" / "uploads" upload_dir = get_uploads_root()
upload_dir.mkdir(parents=True, exist_ok=True)
@router.post("/upload/file") @router.post("/upload/file")
async def upload_file(file: UploadFile = File(...)): async def upload_file(file: UploadFile = File(...)):
upload_dir.mkdir(parents=True, exist_ok=True)
allowed_extensions = ('.csv', '.xls', '.xlsx', '.parquet', '.db', '.sqlite', '.sqlite3') allowed_extensions = ('.csv', '.xls', '.xlsx', '.parquet', '.db', '.sqlite', '.sqlite3')
filename_lower = file.filename.lower() filename_lower = file.filename.lower()
if not filename_lower.endswith(allowed_extensions): if not filename_lower.endswith(allowed_extensions):
+8 -7
View File
@@ -6,6 +6,7 @@ from urllib.parse import quote
from pydantic import BaseModel from pydantic import BaseModel
from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root
LOCAL_URI_PATTERN = re.compile(r"local://[^\s<>'\"\]\)\}]+") LOCAL_URI_PATTERN = re.compile(r"local://[^\s<>'\"\]\)\}]+")
PATH_PATTERN = re.compile( PATH_PATTERN = re.compile(
@@ -138,11 +139,11 @@ def _normalize_locator(raw_locator: str) -> str:
def _resolve_locator(locator: str) -> Path | None: def _resolve_locator(locator: str) -> Path | None:
backend_root = Path(__file__).resolve().parents[2] data_root = get_data_root()
data_root = backend_root / "data" workspace_root = get_workspace_root()
workspace_root = data_root / "workspace" uploads_root = get_uploads_root()
uploads_root = data_root / "uploads" reports_root = get_reports_root()
reports_root = data_root / "data" repo_root = data_root.parent
if locator.startswith("local://"): if locator.startswith("local://"):
raw_local = locator.replace("local://", "", 1).strip().lstrip("/\\") raw_local = locator.replace("local://", "", 1).strip().lstrip("/\\")
if not raw_local: if not raw_local:
@@ -160,11 +161,11 @@ def _resolve_locator(locator: str) -> Path | None:
if path.is_absolute(): if path.is_absolute():
return path return path
if normalized.startswith("data/data/"): if normalized.startswith("data/data/"):
return backend_root / normalized return repo_root / normalized
checks = [ checks = [
workspace_root / normalized, workspace_root / normalized,
data_root / normalized, data_root / normalized,
backend_root / normalized, repo_root / normalized,
] ]
for candidate in checks: for candidate in checks:
if candidate.exists(): if candidate.exists():
+39
View File
@@ -0,0 +1,39 @@
import os
from pathlib import Path
BACKEND_ROOT = Path(__file__).resolve().parents[2]
REPO_ROOT = BACKEND_ROOT.parent
DEFAULT_DATA_ROOT = REPO_ROOT / "data"
LEGACY_DATA_ROOT = BACKEND_ROOT / "data"
def get_data_root() -> Path:
    """Return the directory that holds runtime data.

    Resolution order:

    1. The ``DATA_ROOT`` environment variable, when it is set to a
       non-blank value. ``~`` is expanded and the result is resolved to an
       absolute path; the directory is not required to exist.
    2. ``DEFAULT_DATA_ROOT`` when it exists on disk.
    3. ``LEGACY_DATA_ROOT`` when it exists on disk. A migration hint is
       printed the first time this branch is taken.
    4. ``DEFAULT_DATA_ROOT`` otherwise, even though it does not exist yet.

    Returns:
        The selected data root. No directory is created by this function.
    """
    configured = (os.getenv("DATA_ROOT") or "").strip()
    if configured:
        return Path(configured).expanduser().resolve()
    if DEFAULT_DATA_ROOT.exists():
        return DEFAULT_DATA_ROOT
    if LEGACY_DATA_ROOT.exists():
        # Every other helper in this module funnels through this function,
        # so an unguarded print would repeat the hint on each call. Emit it
        # once per process and remember that on the function object.
        if not getattr(get_data_root, "_legacy_hint_printed", False):
            get_data_root._legacy_hint_printed = True
            print(f"[DATA_ROOT] legacy path detected: {LEGACY_DATA_ROOT}. Please migrate to {DEFAULT_DATA_ROOT}.")
        return LEGACY_DATA_ROOT
    return DEFAULT_DATA_ROOT
def get_workspace_root() -> Path:
    """Return the ``workspace`` directory located under the data root."""
    return get_data_root().joinpath("workspace")
def get_uploads_root() -> Path:
    """Return the ``uploads`` directory located under the data root."""
    return get_data_root().joinpath("uploads")
def get_reports_root() -> Path:
    """Return the reports root, i.e. the ``data`` subdirectory of the data root."""
    return get_data_root().joinpath("data")
def ensure_data_layout() -> None:
    """Create the data root and its workspace, uploads and reports directories.

    Each directory is created with ``parents=True, exist_ok=True``, so
    directories that already exist are left as they are.
    """
    resolvers = (
        get_data_root,
        get_workspace_root,
        get_uploads_root,
        get_reports_root,
    )
    for resolve_dir in resolvers:
        resolve_dir().mkdir(parents=True, exist_ok=True)
+7 -6
View File
@@ -2,12 +2,13 @@ import os
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root
backend_root = Path(__file__).resolve().parents[2]
data_root = backend_root / "data" data_root = get_data_root()
workspace_root = data_root / "workspace" workspace_root = get_workspace_root()
uploads_root = data_root / "uploads" uploads_root = get_uploads_root()
reports_root = data_root / "data" reports_root = get_reports_root()
allowed_artifact_roots = (workspace_root, uploads_root, reports_root) allowed_artifact_roots = (workspace_root, uploads_root, reports_root)
@@ -50,7 +51,7 @@ def resolve_artifact_target(target: str) -> Path | None:
if path.is_absolute(): if path.is_absolute():
return path return path
if normalized.startswith("data/data/"): if normalized.startswith("data/data/"):
return backend_root / normalized return data_root.parent / normalized
checks = ( checks = (
workspace_root / normalized, workspace_root / normalized,
data_root / normalized, data_root / normalized,
+2 -2
View File
@@ -34,6 +34,7 @@ from nanobot.config.schema import Config
from app.api.skills import load_skills from app.api.skills import load_skills
from app.services.llm_cache import get_llm_configs from app.services.llm_cache import get_llm_configs
from app.core.data_root import get_workspace_root
from app.core.streaming_provider import StreamingLiteLLMProvider from app.core.streaming_provider import StreamingLiteLLMProvider
class NanobotIntegration: class NanobotIntegration:
@@ -47,8 +48,7 @@ class NanobotIntegration:
self._model_agent_lock = asyncio.Lock() self._model_agent_lock = asyncio.Lock()
def initialize(self): def initialize(self):
# Set workspace path to backend/data/workspace workspace_path = get_workspace_root()
workspace_path = Path(os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "data", "workspace"))
workspace_path.mkdir(parents=True, exist_ok=True) workspace_path.mkdir(parents=True, exist_ok=True)
self._sync_builtin_skills_to_workspace(workspace_path) self._sync_builtin_skills_to_workspace(workspace_path)
+11 -4
View File
@@ -5,14 +5,21 @@ from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
from app.core.data_root import get_data_root
class SessionAliasStore: class SessionAliasStore:
def __init__(self) -> None: def __init__(self) -> None:
backend_root = Path(__file__).resolve().parents[2] data_dir = get_data_root()
data_dir = backend_root / "data" try:
data_dir.mkdir(parents=True, exist_ok=True) data_dir.mkdir(parents=True, exist_ok=True)
except PermissionError as exc:
raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc
self.db_path = data_dir / "nanobot_sessions.db" self.db_path = data_dir / "nanobot_sessions.db"
self._init_db() try:
self._init_db()
except PermissionError as exc:
raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc
def _connect(self) -> sqlite3.Connection: def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(str(self.db_path)) conn = sqlite3.connect(str(self.db_path))
+2 -2
View File
@@ -6,9 +6,9 @@ from app.models.datasource import DataSource
from app.schemas.mdl import MDLManifest, Model, Column, TableReference from app.schemas.mdl import MDLManifest, Model, Column, TableReference
from app.connectors.factory import get_connector from app.connectors.factory import get_connector
from app.database import SessionLocal from app.database import SessionLocal
from app.core.data_root import get_data_root
# Assuming running from backend/ directory MDL_STORAGE_PATH = get_data_root() / "mdl"
MDL_STORAGE_PATH = Path("data/mdl")
class MDLService: class MDLService:
@staticmethod @staticmethod
+14 -3
View File
@@ -20,6 +20,7 @@ from app.api import upload, llm, skills, users, datasources, projects, semantic
from app.connectors.postgres import postgres_connector from app.connectors.postgres import postgres_connector
from app.connectors.clickhouse import clickhouse_connector from app.connectors.clickhouse import clickhouse_connector
from app.core.artifacts import extract_artifacts from app.core.artifacts import extract_artifacts
from app.core.data_root import ensure_data_layout, get_data_root, get_reports_root
from app.core.files import ensure_artifact_access, resolve_artifact_target from app.core.files import ensure_artifact_access, resolve_artifact_target
from app.core.nanobot import nanobot_service from app.core.nanobot import nanobot_service
from app.core.session_alias_store import session_alias_store from app.core.session_alias_store import session_alias_store
@@ -44,9 +45,12 @@ app.add_middleware(
Base.metadata.create_all(bind=engine) Base.metadata.create_all(bind=engine)
# Mount static directory for reports # Mount static directory for reports
data_dir = os.path.join(os.path.dirname(__file__), "data", "data") try:
os.makedirs(data_dir, exist_ok=True) ensure_data_layout()
app.mount("/reports", StaticFiles(directory=data_dir), name="reports") except Exception as e:
raise RuntimeError(f"DATA_ROOT 初始化失败: {e}") from e
reports_dir = get_reports_root()
app.mount("/reports", StaticFiles(directory=str(reports_dir)), name="reports")
app.include_router(upload.router, prefix="/api/v1") app.include_router(upload.router, prefix="/api/v1")
app.include_router(llm.router, prefix="/api/v1") app.include_router(llm.router, prefix="/api/v1")
@@ -71,6 +75,13 @@ PREVIEWABLE_TEXT_EXTENSIONS = {
@app.on_event("startup") @app.on_event("startup")
async def startup_event(): async def startup_event():
try:
data_root = get_data_root()
data_root.mkdir(parents=True, exist_ok=True)
if not os.access(data_root, os.R_OK | os.W_OK | os.X_OK):
raise RuntimeError(f"DATA_ROOT 权限不足: {data_root}")
except Exception as e:
raise RuntimeError(f"DATA_ROOT 初始化失败: {e}") from e
# Initialize nanobot in background # Initialize nanobot in background
try: try:
await nanobot_service.start() await nanobot_service.start()
+2 -1
View File
@@ -3,11 +3,12 @@ from pathlib import Path
import pytest import pytest
from fastapi.testclient import TestClient from fastapi.testclient import TestClient
from app.core.data_root import get_data_root
from main import app from main import app
def _backend_data_root() -> Path: def _backend_data_root() -> Path:
return Path(__file__).resolve().parents[1] / "data" return get_data_root()
def test_download_artifact_within_whitelist() -> None: def test_download_artifact_within_whitelist() -> None:
+2 -1
View File
@@ -1,10 +1,11 @@
from pathlib import Path from pathlib import Path
from app.core.artifacts import extract_artifacts from app.core.artifacts import extract_artifacts
from app.core.data_root import get_data_root
def _backend_data_root() -> Path: def _backend_data_root() -> Path:
return Path(__file__).resolve().parents[1] / "data" return get_data_root()
def test_extract_artifacts_from_local_and_tool_paths() -> None: def test_extract_artifacts_from_local_and_tool_paths() -> None:
+28
View File
@@ -0,0 +1,28 @@
from pathlib import Path
from app.core import data_root
def test_data_root_prefers_env(monkeypatch, tmp_path: Path) -> None:
    """An explicit ``DATA_ROOT`` value is returned as a resolved path."""
    override = tmp_path.joinpath("custom-data-root")
    monkeypatch.setenv("DATA_ROOT", str(override))
    resolved = data_root.get_data_root()
    assert resolved == override.resolve()
def test_data_root_falls_back_to_legacy(monkeypatch, tmp_path: Path) -> None:
    """With no ``DATA_ROOT`` and no default directory, the legacy directory is used."""
    legacy_dir = tmp_path / "legacy-data"
    missing_default = tmp_path / "default-data"
    legacy_dir.mkdir(parents=True, exist_ok=True)
    monkeypatch.delenv("DATA_ROOT", raising=False)
    monkeypatch.setattr(data_root, "DEFAULT_DATA_ROOT", missing_default)
    monkeypatch.setattr(data_root, "LEGACY_DATA_ROOT", legacy_dir)
    assert data_root.get_data_root() == legacy_dir
def test_ensure_data_layout_creates_children(monkeypatch, tmp_path: Path) -> None:
    """``ensure_data_layout`` creates the three child directories under ``DATA_ROOT``."""
    monkeypatch.setenv("DATA_ROOT", str(tmp_path / "dr"))
    data_root.ensure_data_layout()
    base = data_root.get_data_root()
    for child in ("workspace", "uploads", "data"):
        assert (base / child).exists()
+26 -2
View File
@@ -1114,8 +1114,20 @@ export function ChatInterface() {
<iframe <iframe
title={`report-${msg.id}`} title={`report-${msg.id}`}
srcDoc={reportHtml} srcDoc={reportHtml}
sandbox="allow-same-origin" sandbox="allow-same-origin allow-scripts"
className="w-full h-[620px] bg-white" className="w-full h-[620px] bg-white"
onLoad={(e) => {
try {
const doc = (e.target as HTMLIFrameElement).contentDocument;
if (doc) {
const style = doc.createElement('style');
style.textContent = `html, body { overflow: auto !important; }`;
doc.head.appendChild(style);
}
} catch (err) {
console.error("Failed to inject styles", err);
}
}}
/> />
</div> </div>
) : null} ) : null}
@@ -1371,7 +1383,19 @@ export function ChatInterface() {
<iframe <iframe
title={artifactPreview.name} title={artifactPreview.name}
src={artifactPreview.previewUrl} src={artifactPreview.previewUrl}
className="w-full h-full" className="w-full h-full border-0"
onLoad={(e) => {
try {
const doc = (e.target as HTMLIFrameElement).contentDocument;
if (doc) {
const style = doc.createElement('style');
style.textContent = `html, body { overflow: auto !important; }`;
doc.head.appendChild(style);
}
} catch (err) {
console.error("Failed to inject styles into iframe", err);
}
}}
/> />
) : null} ) : null}
</div> </div>