Update 2026-05-13 16:43:53

This commit is contained in:
yi
2026-05-13 16:43:53 +08:00
parent 6af5c584f4
commit afd7c5fe85
490 changed files with 850 additions and 922 deletions
+50
View File
@@ -0,0 +1,50 @@
from clickhouse_driver import Client
import os
class ClickHouseConnector:
    """Thin wrapper around a ``clickhouse_driver`` ``Client``.

    Each constructor argument falls back to an environment variable when the
    supplied value is falsy.  Because ``port``, ``user`` and ``database`` have
    truthy defaults, their environment fallbacks only apply when a caller
    explicitly passes a falsy value (``None``, ``0`` or ``""``).
    """

    def __init__(self, host: str = None, port: int = 9000, user: str = 'default', password: str = '', database: str = 'default'):
        """Resolve connection settings and create the client.

        Args:
            host: Server host; falls back to ``CLICKHOUSE_HOST`` then ``localhost``.
            port: Server port; falls back to ``CLICKHOUSE_PORT`` then 9000.
            user: User name; falls back to ``CLICKHOUSE_USER`` then ``default``.
            password: Password; falls back to ``CLICKHOUSE_PASSWORD`` then ``""``.
            database: Database; falls back to ``CLICKHOUSE_DB`` then ``default``.
        """
        if not host:
            host = os.getenv("CLICKHOUSE_HOST", "localhost")
        if not port:
            port = int(os.getenv("CLICKHOUSE_PORT", 9000))
        if not user:
            user = os.getenv("CLICKHOUSE_USER", "default")
        if not password:
            password = os.getenv("CLICKHOUSE_PASSWORD", "")
        if not database:
            database = os.getenv("CLICKHOUSE_DB", "default")

        self.host = host
        self.port = port
        self.user = user
        self.password = password
        self.database = database
        self.client = Client(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            database=self.database
        )

    def execute_query(self, query: str):
        """Run ``query`` and return the client's result with column types.

        Any exception is printed and then re-raised to the caller.
        """
        try:
            outcome = self.client.execute(query, with_column_types=True)
        except Exception as e:
            print(f"ClickHouse Query Error: {e}")
            raise e
        return outcome

    def get_schema(self):
        """Return ``{table: [{"name": ..., "type": ...}, ...]}`` for the current database.

        Best effort: on any error the message is printed and ``{}`` is returned.
        """
        query = "SELECT table, name, type FROM system.columns WHERE database = currentDatabase()"
        try:
            tables = {}
            for record in self.client.execute(query):
                column = {"name": record[1], "type": record[2]}
                tables.setdefault(record[0], []).append(column)
            return tables
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}

    def test_connection(self) -> bool:
        """Return True when ``SELECT 1`` succeeds, otherwise print the error and return False."""
        try:
            self.client.execute("SELECT 1")
        except Exception as e:
            print(f"ClickHouse Connection Error: {e}")
            return False
        return True
# Module-level connector instance, created when this module is imported,
# using the constructor's default arguments (and their environment fallbacks).
clickhouse_connector = ClickHouseConnector()
+68
View File
@@ -0,0 +1,68 @@
import duckdb
import pandas as pd
from typing import List, Dict, Any
import os
from app.core.files import resolve_upload_file_path
class CSVConnector:
    """Query a single CSV file through an in-memory DuckDB view.

    Every operation opens its own ``:memory:`` DuckDB connection, exposes the
    file as a view named after the (sanitised) file name, and closes the
    connection afterwards.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"CSV file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-safe-ish view name from the file's base name."""
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        # Replace every non-alphanumeric character with an underscore.
        safe_name = "".join(c if c.isalnum() else "_" for c in base)
        # An unquoted identifier must not start with a digit.
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _path_literal(self) -> str:
        """Return the file path as a SQL string literal.

        Single quotes are doubled so a path such as ``/data/o'brien.csv``
        cannot terminate the literal early and break (or alter) the statement.
        """
        escaped = str(self.file_path).replace("'", "''")
        return f"'{escaped}'"

    def _create_view(self, conn, table_name: str) -> None:
        """Register the CSV file on ``conn`` as a view called ``table_name``."""
        conn.execute(
            f"CREATE OR REPLACE VIEW {table_name} AS "
            f"SELECT * FROM read_csv_auto({self._path_literal()})"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run ``query`` against the CSV view and return the rows as dicts.

        The query is expected to reference the table name reported by
        ``get_schema``.  Errors are printed and re-raised.
        """
        conn = duckdb.connect(":memory:")
        try:
            self._create_view(conn, self._get_table_name())
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        except Exception as e:
            print(f"CSV Query Error: {e}")
            raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{table_name: [{"name": ..., "type": ...}, ...]}``.

        Best effort: on any error the message is printed and ``{}`` is returned.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._create_view(conn, table_name)
            # Each DESCRIBE row starts with the column name, then its type.
            columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
            return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when one row can be read from the file, else print the error and return False."""
        try:
            conn = duckdb.connect(":memory:")
            try:
                conn.execute(f"SELECT * FROM read_csv_auto({self._path_literal()}) LIMIT 1")
            finally:
                # Close the connection even when the probe query fails.
                conn.close()
            return True
        except Exception as e:
            print(f"CSV Connection Error: {e}")
            return False
+48
View File
@@ -0,0 +1,48 @@
import duckdb
import pandas as pd
from typing import List, Dict, Any, Optional
import os
class DuckDBConnector:
    """Connector for a DuckDB database identified by ``db_path``.

    Every method opens its own connection to ``db_path`` and closes it before
    returning.
    NOTE(review): with the default ``":memory:"`` path each call presumably
    sees a new, empty database -- confirm that is intended.
    """

    def __init__(self, db_path: str = ":memory:"):
        """Store the database path; no connection is opened here."""
        self.db_path = db_path

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return ``name`` as a double-quoted SQL identifier.

        Embedded double quotes are doubled, so table names containing spaces,
        punctuation or reserved words can be used safely in a statement.
        """
        escaped = str(name).replace('"', '""')
        return f'"{escaped}"'

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run ``query`` and return the result rows as a list of dicts."""
        conn = duckdb.connect(self.db_path)
        try:
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, Any]:
        """Describe every table listed by ``SHOW TABLES``.

        Returns:
            ``{table: {"columns": [{"name", "type"}, ...],
            "primary_keys": [], "foreign_keys": []}}``.  The key lists are
            always empty because ``DESCRIBE`` output is not used to derive them.
        """
        conn = duckdb.connect(self.db_path)
        try:
            schema = {}
            for (table_name,) in conn.execute("SHOW TABLES").fetchall():
                # Quote the name: an unquoted reserved word or a name with
                # special characters would make DESCRIBE fail.
                described = conn.execute(
                    f"DESCRIBE {self._quote_identifier(table_name)}"
                ).fetchall()
                schema[table_name] = {
                    "columns": [{"name": col[0], "type": col[1]} for col in described],
                    "primary_keys": [],
                    "foreign_keys": [],
                }
            return schema
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when ``SELECT 1`` succeeds, else print the error and return False."""
        try:
            conn = duckdb.connect(self.db_path)
            try:
                conn.execute("SELECT 1")
            finally:
                # Close the connection even when the probe query fails.
                conn.close()
            return True
        except Exception as e:
            print(f"DuckDB Connection Error: {e}")
            return False
+82
View File
@@ -0,0 +1,82 @@
from typing import Dict, Any, Optional
import json
import functools
from app.connectors.postgres import PostgresConnector
from app.connectors.clickhouse import ClickHouseConnector
from app.connectors.parquet import ParquetConnector
from app.connectors.csv import CSVConnector
from app.connectors.duckdb import DuckDBConnector
from app.models.datasource import DataSource
from app.core.files import resolve_upload_file_path
@functools.lru_cache(maxsize=32)
def _get_cached_connector(ds_type: str, config_json: str):
    """Build, or reuse from cache, the connector for a data source.

    Results are memoised by ``functools.lru_cache`` on the exact
    ``(ds_type, config_json)`` pair; at most 32 connectors are kept.

    Args:
        ds_type: Data source type, compared as given (the public helpers in
            this module lowercase it before calling).
        config_json: JSON object text with the connection settings.

    Returns:
        A connector instance for the requested type.

    Raises:
        ValueError: If ``ds_type`` is not supported, or a sqlite source has
            neither ``connection_string`` nor ``file_path``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    config = json.loads(config_json)

    def credentials() -> str:
        # Percent-encode user and password so characters such as '@', ':' or
        # '/' cannot be mistaken for URL delimiters.
        user = quote(str(config.get("user")), safe="")
        password = quote(str(config.get("password")), safe="")
        return f"{user}:{password}"

    if ds_type in ("postgres", "postgresql", "supabase"):
        db_url = config.get("connection_string")
        if not db_url:
            default_port = 6543 if ds_type == "supabase" else 5432
            port = config.get("port") or default_port
            db_url = f"postgresql://{credentials()}@{config.get('host')}:{port}/{config.get('database')}"
        # Supabase URLs always get sslmode=require unless already specified.
        if ds_type == "supabase" and "?" not in db_url:
            db_url += "?sslmode=require"
        elif ds_type == "supabase" and "sslmode=" not in db_url:
            db_url += "&sslmode=require"
        return PostgresConnector(db_url=db_url)

    if ds_type == "mysql":
        db_url = config.get("connection_string")
        if not db_url:
            port = config.get("port") or 3306
            db_url = f"mysql+pymysql://{credentials()}@{config.get('host')}:{port}/{config.get('database')}"
        elif db_url.startswith("mysql://"):
            # Rewrite only the leading scheme; "mysql://" appearing later in
            # the URL (e.g. inside a password) must be left alone.
            db_url = "mysql+pymysql://" + db_url[len("mysql://"):]
        return PostgresConnector(db_url=db_url)

    if ds_type == "sqlite":
        # SQLite is addressed by a connection string or an uploaded file path.
        db_url = config.get("connection_string")
        if not db_url and config.get("file_path"):
            file_path = str(resolve_upload_file_path(config.get("file_path")))
            db_url = f"sqlite:///{file_path}"
        if not db_url:
            # Without this guard the connector would silently fall back to
            # its own default (PostgreSQL) URL.
            raise ValueError("SQLite data source requires 'connection_string' or 'file_path'")
        return PostgresConnector(db_url=db_url)

    if ds_type == "clickhouse":
        return ClickHouseConnector(
            host=config.get("host"),
            port=config.get("port", 9000),
            user=config.get("user", "default"),
            password=config.get("password", ""),
            database=config.get("database", "default"),
        )

    if ds_type == "duckdb":
        db_path = config.get("database") or config.get("file_path") or ":memory:"
        if db_path != ":memory:":
            db_path = str(resolve_upload_file_path(db_path))
        return DuckDBConnector(db_path=db_path)

    if ds_type == "parquet":
        file_path = str(resolve_upload_file_path(config.get("file_path")))
        return ParquetConnector(file_path=file_path)

    if ds_type == "csv":
        file_path = str(resolve_upload_file_path(config.get("file_path")))
        return CSVConnector(file_path=file_path)

    raise ValueError(f"Unsupported data source type: {ds_type}")
def get_connector(datasource: DataSource):
    """Return the (cached) connector for a stored data source.

    The config is serialised to JSON with sorted keys so that equal configs
    always produce the same cache key; the type is lowercased before lookup.
    """
    cache_key = json.dumps(datasource.config, sort_keys=True)
    normalized_type = datasource.type.lower()
    return _get_cached_connector(normalized_type, cache_key)
def get_connector_from_config(ds_type: str, config: Dict[str, Any]):
    """Return the (cached) connector for a raw type/config pair.

    Helper for testing a connection before the data source is saved to the DB.
    The config is serialised with sorted keys to form a stable cache key.
    """
    cache_key = json.dumps(config, sort_keys=True)
    normalized_type = ds_type.lower()
    return _get_cached_connector(normalized_type, cache_key)
+58
View File
@@ -0,0 +1,58 @@
import duckdb
import pandas as pd
from typing import List, Dict, Any
import os
class ParquetConnector:
    """Query a single Parquet file through an in-memory DuckDB view.

    Every operation opens its own ``:memory:`` DuckDB connection, exposes the
    file as a view named after the (sanitised) file name, and closes the
    connection afterwards.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
        """
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"Parquet file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-safe-ish view name from the file's base name.

        Names made only of letters, digits and underscores that do not start
        with a digit are returned unchanged; anything else (``my-file``,
        ``2024``) would not be usable as an unquoted identifier.
        """
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        safe_name = "".join(c if c.isalnum() else "_" for c in base)
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _path_literal(self) -> str:
        """Return the file path as a SQL string literal (single quotes doubled)."""
        escaped = str(self.file_path).replace("'", "''")
        return f"'{escaped}'"

    def _create_view(self, conn, table_name: str) -> None:
        """Register the Parquet file on ``conn`` as a view called ``table_name``."""
        conn.execute(
            f"CREATE OR REPLACE VIEW {table_name} AS "
            f"SELECT * FROM read_parquet({self._path_literal()})"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run ``query`` against the Parquet view and return the rows as dicts.

        The query is expected to reference the table name reported by
        ``get_schema``.  Errors are printed and re-raised.
        """
        conn = duckdb.connect(":memory:")
        try:
            # Inside the try so the connection is closed if registration fails.
            self._create_view(conn, self._get_table_name())
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        except Exception as e:
            print(f"Parquet Query Error: {e}")
            raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{table_name: [{"name": ..., "type": ...}, ...]}``.

        A failure while registering the view propagates to the caller; a
        failure while describing it is printed and yields ``{}``.
        """
        table_name = self._get_table_name()
        conn = duckdb.connect(":memory:")
        try:
            self._create_view(conn, table_name)
            try:
                columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
                return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
            except Exception as e:
                print(f"Error getting schema: {e}")
                return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when one row can be read from the file, else print the error and return False."""
        try:
            conn = duckdb.connect(":memory:")
            try:
                conn.execute(f"SELECT * FROM read_parquet({self._path_literal()}) LIMIT 1")
            finally:
                # Close the connection even when the probe query fails.
                conn.close()
            return True
        except Exception as e:
            print(f"Parquet Connection Error: {e}")
            return False
+113
View File
@@ -0,0 +1,113 @@
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker
from typing import Generator
import os
class PostgresConnector:
    """SQLAlchemy-engine connector used for SQL databases reachable by URL."""

    def __init__(self, db_url: str = None):
        """Create the engine and session factory.

        Args:
            db_url: SQLAlchemy URL.  When falsy, ``POSTGRES_URL`` from the
                environment is used, then a built-in placeholder URL.
        """
        if not db_url:
            db_url = os.getenv("POSTGRES_URL", "postgresql://user:password@localhost:5432/dbname")
        self.db_url = db_url
        self.engine = create_engine(self.db_url)
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)

    def get_db(self) -> Generator:
        """Yield a new session and close it when the generator finishes."""
        session = self.SessionLocal()
        try:
            yield session
        finally:
            session.close()

    def execute_query(self, query: str):
        """Execute ``query`` as text and return each row as a plain dict."""
        with self.engine.connect() as conn:
            rows = conn.execute(text(query))
            return [dict(record._mapping) for record in rows]

    @staticmethod
    def _column_entries(raw_columns):
        """Reduce reflected column records to ``{"name", "type"}`` dicts."""
        return [{"name": c['name'], "type": str(c['type'])} for c in raw_columns]

    @staticmethod
    def _pk_columns(pk_constraint):
        """Return the constrained columns of a primary-key record, or ``[]``."""
        if not pk_constraint:
            return []
        return pk_constraint.get('constrained_columns', [])

    @staticmethod
    def _fk_entries(raw_fks):
        """Reduce reflected foreign-key records to the three fields exposed."""
        return [
            {
                "constrained_columns": fk['constrained_columns'],
                "referred_table": fk['referred_table'],
                "referred_columns": fk['referred_columns']
            }
            for fk in raw_fks
        ]

    def get_schema(self):
        """Reflect tables, columns, primary keys and foreign keys.

        Returns:
            ``{table: {"columns": [...], "primary_keys": [...],
            "foreign_keys": [...]}}``.

        Any error is printed (with traceback) and re-raised.
        """
        try:
            from sqlalchemy import inspect

            inspector = inspect(self.engine)
            # 'public' is used for postgresql; every other dialect passes None.
            schema_name = 'public' if self.engine.dialect.name == 'postgresql' else None
            table_names = inspector.get_table_names(schema=schema_name)

            # Prefer the bulk reflection methods when the inspector has them,
            # so the work is not repeated once per table.
            if hasattr(inspector, 'get_multi_columns'):
                all_columns = inspector.get_multi_columns(schema=schema_name)
                all_pks = inspector.get_multi_pk_constraint(schema=schema_name)
                all_fks = inspector.get_multi_foreign_keys(schema=schema_name)
                return {
                    name: {
                        "columns": self._column_entries(all_columns.get((schema_name, name), [])),
                        "primary_keys": self._pk_columns(all_pks.get((schema_name, name))),
                        "foreign_keys": self._fk_entries(all_fks.get((schema_name, name), []))
                    }
                    for name in table_names
                }

            # Fallback: reflect table by table.
            described = {}
            for name in table_names:
                cols = self._column_entries(inspector.get_columns(name, schema=schema_name))
                pks = self._pk_columns(inspector.get_pk_constraint(name, schema=schema_name))
                fks = self._fk_entries(inspector.get_foreign_keys(name, schema=schema_name))
                described[name] = {
                    "columns": cols,
                    "primary_keys": pks,
                    "foreign_keys": fks
                }
            return described
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(f"Error getting schema: {e}")
            raise e

    def test_connection(self) -> bool:
        """Return True when ``SELECT 1`` succeeds; on failure print the error and re-raise it."""
        try:
            with self.engine.connect() as conn:
                conn.execute(text("SELECT 1"))
                return True
        except Exception as e:
            print(f"PostgreSQL Connection Error: {e}")
            raise e
# Module-level connector instance, created when this module is imported.
# No URL is passed, so the constructor uses POSTGRES_URL from the environment
# or its built-in placeholder URL.
postgres_connector = PostgresConnector()