Update 2026-05-13 16:43:53
This commit is contained in:
@@ -0,0 +1,50 @@
|
||||
from clickhouse_driver import Client
|
||||
import os
|
||||
|
||||
class ClickHouseConnector:
    """Thin wrapper around a ``clickhouse_driver.Client``.

    Each connection setting is resolved in this order: the constructor
    argument (if truthy), the matching ``CLICKHOUSE_*`` environment
    variable, then a built-in default.
    """

    def __init__(self, host: str = None, port: int = None, user: str = None, password: str = '', database: str = None):
        """Resolve the connection settings and build the driver client.

        Args:
            host: Server host; falls back to ``CLICKHOUSE_HOST`` / ``localhost``.
            port: Server port; falls back to ``CLICKHOUSE_PORT`` / ``9000``.
            user: User name; falls back to ``CLICKHOUSE_USER`` / ``default``.
            password: Password; falls back to ``CLICKHOUSE_PASSWORD`` / ``""``.
            database: Database; falls back to ``CLICKHOUSE_DB`` / ``default``.
        """
        # The optional arguments default to None (not to the final default
        # value) so that the environment variables are actually consulted
        # when the caller does not pass a value.
        self.host = self._setting(host, "CLICKHOUSE_HOST", "localhost")
        self.port = int(self._setting(port, "CLICKHOUSE_PORT", 9000))
        self.user = self._setting(user, "CLICKHOUSE_USER", "default")
        self.password = self._setting(password, "CLICKHOUSE_PASSWORD", "")
        self.database = self._setting(database, "CLICKHOUSE_DB", "default")

        self.client = Client(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            database=self.database,
        )

    @staticmethod
    def _setting(value, env_name, default):
        """Return *value* if truthy, else the env var *env_name*, else *default*."""
        return value or os.getenv(env_name, default)

    def execute_query(self, query: str):
        """Run *query* and return the driver result with column types.

        Raises:
            Exception: whatever the driver raised, after printing it.
        """
        try:
            return self.client.execute(query, with_column_types=True)
        except Exception as e:
            print(f"ClickHouse Query Error: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def get_schema(self):
        """Return ``{table: [{"name": ..., "type": ...}, ...]}`` for the current database.

        Returns an empty dict (after printing the error) if the lookup fails.
        """
        query = "SELECT table, name, type FROM system.columns WHERE database = currentDatabase()"
        try:
            schema = {}
            for table, name, col_type in self.client.execute(query):
                schema.setdefault(table, []).append({"name": name, "type": col_type})
            return schema
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}

    def test_connection(self) -> bool:
        """Return True if ``SELECT 1`` succeeds, otherwise print the error and return False."""
        try:
            self.client.execute("SELECT 1")
            return True
        except Exception as e:
            print(f"ClickHouse Connection Error: {e}")
            return False
|
||||
|
||||
# Module-level connector instance, constructed with default arguments when
# this module is imported.
clickhouse_connector = ClickHouseConnector()
|
||||
@@ -0,0 +1,68 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any
|
||||
import os
|
||||
from app.core.files import resolve_upload_file_path
|
||||
|
||||
class CSVConnector:
    """Query a single CSV file through a throw-away in-memory DuckDB database.

    The file is exposed as a view whose name is derived from the file name
    (see ``_get_table_name``); every call opens and closes its own connection.
    """

    def __init__(self, file_path: str):
        """Remember *file_path*.

        Raises:
            FileNotFoundError: if the path does not exist.
        """
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"CSV file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-friendly view name from the file name (without extension)."""
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        # Replace every non-alphanumeric character with an underscore.
        safe_name = "".join(c if c.isalnum() else "_" for c in base)
        # Identifiers must not start with a digit.
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _path_literal(self) -> str:
        """Return the file path as a single-quoted SQL string literal.

        Embedded single quotes are doubled so a path such as ``it's.csv``
        cannot terminate the literal early.
        """
        escaped = str(self.file_path).replace("'", "''")
        return f"'{escaped}'"

    def _register_view(self, conn, table_name: str) -> None:
        """Create (or replace) the view *table_name* over the CSV file on *conn*."""
        # The view name is double-quoted so that names colliding with SQL
        # keywords (e.g. "order") are still accepted.
        conn.execute(
            f'CREATE OR REPLACE VIEW "{table_name}" AS '
            f"SELECT * FROM read_csv_auto({self._path_literal()})"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* against the CSV view and return the rows as dicts.

        The query is expected to reference the view name reported by
        ``get_schema``.

        Raises:
            Exception: whatever DuckDB raised, after printing it.
        """
        conn = duckdb.connect(":memory:")
        try:
            self._register_view(conn, self._get_table_name())
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        except Exception as e:
            print(f"CSV Query Error: {e}")
            raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{view_name: [{"name": ..., "type": ...}, ...]}``.

        Returns an empty dict (after printing the error) on failure.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._register_view(conn, table_name)
            # DESCRIBE rows start with (column name, column type, ...).
            columns = conn.execute(f'DESCRIBE "{table_name}"').fetchall()
            return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True if one row can be read from the file, else print the error and return False."""
        try:
            conn = duckdb.connect(":memory:")
            try:
                conn.execute(f"SELECT * FROM read_csv_auto({self._path_literal()}) LIMIT 1")
            finally:
                # Close the connection even when the probe query fails.
                conn.close()
            return True
        except Exception as e:
            print(f"CSV Connection Error: {e}")
            return False
|
||||
@@ -0,0 +1,48 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Optional
|
||||
import os
|
||||
|
||||
class DuckDBConnector:
    """Run queries against a DuckDB database, opening a new connection per call."""

    def __init__(self, db_path: str = ":memory:"):
        """Remember the database path (``":memory:"`` by default)."""
        self.db_path = db_path

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier (inner quotes doubled)."""
        escaped = str(name).replace('"', '""')
        return f'"{escaped}"'

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* and return the result rows as a list of dicts."""
        conn = duckdb.connect(self.db_path)
        try:
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, Any]:
        """Return ``{table: {"columns": [...], "primary_keys": [], "foreign_keys": []}}``.

        Key information is not collected here, so both key lists are always
        empty.
        """
        conn = duckdb.connect(self.db_path)
        try:
            schema = {}
            tables = conn.execute("SHOW TABLES").fetchall()
            for (table_name,) in tables:
                # Quote the name so tables named after keywords or containing
                # special characters can still be described.
                columns_info = conn.execute(
                    f"DESCRIBE {self._quote_identifier(table_name)}"
                ).fetchall()
                schema[table_name] = {
                    "columns": [{"name": col[0], "type": col[1]} for col in columns_info],
                    "primary_keys": [],  # DuckDB describe doesn't easily show PKs in this format
                    "foreign_keys": [],
                }
            return schema
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True if ``SELECT 1`` succeeds, else print the error and return False."""
        try:
            conn = duckdb.connect(self.db_path)
            try:
                conn.execute("SELECT 1")
            finally:
                # Close the connection even when the probe query fails.
                conn.close()
            return True
        except Exception as e:
            print(f"DuckDB Connection Error: {e}")
            return False
|
||||
@@ -0,0 +1,82 @@
|
||||
from typing import Dict, Any, Optional
|
||||
import json
|
||||
import functools
|
||||
from app.connectors.postgres import PostgresConnector
|
||||
from app.connectors.clickhouse import ClickHouseConnector
|
||||
from app.connectors.parquet import ParquetConnector
|
||||
from app.connectors.csv import CSVConnector
|
||||
from app.connectors.duckdb import DuckDBConnector
|
||||
from app.models.datasource import DataSource
|
||||
from app.core.files import resolve_upload_file_path
|
||||
|
||||
@functools.lru_cache(maxsize=32)
|
||||
def _get_cached_connector(ds_type: str, config_json: str):
|
||||
config = json.loads(config_json)
|
||||
|
||||
if ds_type in ["postgres", "postgresql", "supabase"]:
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url:
|
||||
default_port = 6543 if ds_type == "supabase" else 5432
|
||||
port = config.get("port") or default_port
|
||||
db_url = f"postgresql://{config.get('user')}:{config.get('password')}@{config.get('host')}:{port}/{config.get('database')}"
|
||||
|
||||
if ds_type == "supabase" and "?" not in db_url:
|
||||
db_url += "?sslmode=require"
|
||||
elif ds_type == "supabase" and "sslmode=" not in db_url:
|
||||
db_url += "&sslmode=require"
|
||||
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "mysql":
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url:
|
||||
port = config.get("port") or 3306
|
||||
db_url = f"mysql+pymysql://{config.get('user')}:{config.get('password')}@{config.get('host')}:{port}/{config.get('database')}"
|
||||
elif not db_url.startswith("mysql+pymysql://"):
|
||||
db_url = db_url.replace("mysql://", "mysql+pymysql://")
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "sqlite":
|
||||
# SQLite uses connection string usually file path
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url and config.get("file_path"):
|
||||
file_path = str(resolve_upload_file_path(config.get("file_path")))
|
||||
db_url = f"sqlite:///{file_path}"
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "clickhouse":
|
||||
return ClickHouseConnector(
|
||||
host=config.get("host"),
|
||||
port=config.get("port", 9000),
|
||||
user=config.get("user", "default"),
|
||||
password=config.get("password", ""),
|
||||
database=config.get("database", "default")
|
||||
)
|
||||
|
||||
elif ds_type == "duckdb":
|
||||
db_path = config.get("database") or config.get("file_path") or ":memory:"
|
||||
if db_path != ":memory:":
|
||||
db_path = str(resolve_upload_file_path(db_path))
|
||||
return DuckDBConnector(db_path=db_path)
|
||||
|
||||
elif ds_type == "parquet":
|
||||
file_path = str(resolve_upload_file_path(config.get("file_path")))
|
||||
return ParquetConnector(file_path=file_path)
|
||||
|
||||
elif ds_type == "csv":
|
||||
file_path = str(resolve_upload_file_path(config.get("file_path")))
|
||||
return CSVConnector(file_path=file_path)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported data source type: {ds_type}")
|
||||
|
||||
def get_connector(datasource: DataSource):
    """Return the connector for *datasource*, reusing a cached one when possible.

    The config is serialized to JSON with sorted keys so that equal configs
    always produce the same cache key.
    """
    cache_key = json.dumps(datasource.config, sort_keys=True)
    normalized_type = datasource.type.lower()
    return _get_cached_connector(normalized_type, cache_key)
|
||||
|
||||
def get_connector_from_config(ds_type: str, config: Dict[str, Any]):
    """Return a connector built from a raw type/config pair.

    Helper for testing a connection without a saved data source; the config
    is serialized with sorted keys to form the cache key.
    """
    cache_key = json.dumps(config, sort_keys=True)
    normalized_type = ds_type.lower()
    return _get_cached_connector(normalized_type, cache_key)
|
||||
@@ -0,0 +1,58 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any
|
||||
import os
|
||||
|
||||
class ParquetConnector:
    """Query a single Parquet file through a throw-away in-memory DuckDB database.

    The file is exposed as a view whose name is derived from the file name
    (see ``_get_table_name``); every call opens and closes its own connection.
    """

    def __init__(self, file_path: str):
        """Remember *file_path*.

        Raises:
            FileNotFoundError: if the path does not exist.
        """
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"Parquet file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-friendly view name from the file name (without extension).

        Names that are already plain identifiers are returned unchanged;
        other characters become underscores and a leading digit gets a
        ``t_`` prefix.
        """
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        safe_name = "".join(c if c.isalnum() else "_" for c in base)
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _path_literal(self) -> str:
        """Return the file path as a single-quoted SQL string literal.

        Embedded single quotes are doubled so they cannot end the literal.
        """
        escaped = str(self.file_path).replace("'", "''")
        return f"'{escaped}'"

    def _register_view(self, conn, table_name: str) -> None:
        """Create (or replace) the view *table_name* over the Parquet file on *conn*."""
        conn.execute(
            f'CREATE OR REPLACE VIEW "{table_name}" AS '
            f"SELECT * FROM read_parquet({self._path_literal()})"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* against the Parquet view and return the rows as dicts.

        The query is expected to reference the view name reported by
        ``get_schema``.

        Raises:
            Exception: whatever DuckDB raised; query errors are printed first.
        """
        conn = duckdb.connect(":memory:")
        try:
            # Inside the try/finally so the connection is closed even when
            # the view cannot be created.
            self._register_view(conn, self._get_table_name())
            try:
                # DuckDB returns a dataframe; convert it to a list of dicts.
                df = conn.execute(query).df()
                return df.to_dict(orient="records")
            except Exception as e:
                print(f"Parquet Query Error: {e}")
                raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{view_name: [{"name": ..., "type": ...}, ...]}``.

        A failing ``DESCRIBE`` is printed and yields an empty dict; a failure
        to create the view propagates to the caller.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._register_view(conn, table_name)
            try:
                # DESCRIBE rows start with (column name, column type, ...).
                columns = conn.execute(f'DESCRIBE "{table_name}"').fetchall()
                return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
            except Exception as e:
                print(f"Error getting schema: {e}")
                return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True if one row can be read from the file, else print the error and return False."""
        try:
            conn = duckdb.connect(":memory:")
            try:
                conn.execute(f"SELECT * FROM read_parquet({self._path_literal()}) LIMIT 1")
            finally:
                # Close the connection even when the probe query fails.
                conn.close()
            return True
        except Exception as e:
            print(f"Parquet Connection Error: {e}")
            return False
|
||||
@@ -0,0 +1,113 @@
|
||||
from sqlalchemy import create_engine, text
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from typing import Generator
|
||||
import os
|
||||
|
||||
class PostgresConnector:
    """SQLAlchemy-engine based connector.

    The engine is created from ``db_url``; schema inspection uses the
    ``public`` schema for the ``postgresql`` dialect and the default schema
    for any other dialect.
    """

    def __init__(self, db_url: str = None):
        """Create the engine and session factory.

        Args:
            db_url: SQLAlchemy URL. Falls back to the ``POSTGRES_URL``
                environment variable, then to a placeholder local URL.
        """
        self.db_url = db_url or os.getenv("POSTGRES_URL", "postgresql://user:password@localhost:5432/dbname")
        self.engine = create_engine(self.db_url)
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)

    def get_db(self) -> Generator:
        """Yield a session and close it when the generator is finished."""
        db = self.SessionLocal()
        try:
            yield db
        finally:
            db.close()

    def execute_query(self, query: str):
        """Run *query* and return the rows as a list of dicts.

        Statements that produce no result set return an empty list instead
        of failing while iterating the result.
        """
        with self.engine.connect() as connection:
            result = connection.execute(text(query))
            if not result.returns_rows:
                return []
            return [dict(row._mapping) for row in result]

    @staticmethod
    def _table_entry(columns, pk_constraint, foreign_keys):
        """Convert inspector output for one table into the schema dict entry."""
        return {
            "columns": [
                {"name": col['name'], "type": str(col['type'])}
                for col in columns
            ],
            "primary_keys": pk_constraint.get('constrained_columns', []) if pk_constraint else [],
            "foreign_keys": [
                {
                    "constrained_columns": fk['constrained_columns'],
                    "referred_table": fk['referred_table'],
                    "referred_columns": fk['referred_columns'],
                }
                for fk in foreign_keys
            ],
        }

    def get_schema(self):
        """Return ``{table: {"columns", "primary_keys", "foreign_keys"}}``.

        Raises:
            Exception: any inspection error, after printing the traceback.
        """
        try:
            from sqlalchemy import inspect

            inspector = inspect(self.engine)
            # Default schema for postgres is 'public', sqlite is None
            schema_name = 'public' if self.engine.dialect.name == 'postgresql' else None
            table_names = inspector.get_table_names(schema=schema_name)

            # Use SQLAlchemy 2.0+ multi-fetch to avoid N+1 queries, which
            # matters most over remote networks.
            if hasattr(inspector, 'get_multi_columns'):
                multi_columns = inspector.get_multi_columns(schema=schema_name)
                multi_pk = inspector.get_multi_pk_constraint(schema=schema_name)
                multi_fk = inspector.get_multi_foreign_keys(schema=schema_name)

                schema = {}
                for table_name in table_names:
                    # The multi-* results are keyed by (schema, table).
                    key = (schema_name, table_name)
                    schema[table_name] = self._table_entry(
                        multi_columns.get(key, []),
                        multi_pk.get(key),
                        multi_fk.get(key, []),
                    )
                return schema

            # Fallback for older SQLAlchemy versions: one lookup per table.
            schema = {}
            for table_name in table_names:
                schema[table_name] = self._table_entry(
                    inspector.get_columns(table_name, schema=schema_name),
                    inspector.get_pk_constraint(table_name, schema=schema_name),
                    inspector.get_foreign_keys(table_name, schema=schema_name),
                )
            return schema
        except Exception as e:
            import traceback

            traceback.print_exc()
            print(f"Error getting schema: {e}")
            # Bare raise keeps the original traceback intact.
            raise

    def test_connection(self) -> bool:
        """Return True if ``SELECT 1`` succeeds.

        Unlike a plain boolean probe, a failure is printed and then
        re-raised rather than reported as False.
        """
        try:
            with self.engine.connect() as connection:
                connection.execute(text("SELECT 1"))
            return True
        except Exception as e:
            print(f"PostgreSQL Connection Error: {e}")
            raise
|
||||
|
||||
# Module-level connector instance, constructed with no db_url when this module
# is imported, so the URL comes from POSTGRES_URL or the built-in placeholder.
postgres_connector = PostgresConnector()
|
||||
Reference in New Issue
Block a user