feat: add data source
This commit is contained in:
@@ -19,7 +19,7 @@ class ClickHouseConnector:
|
||||
|
||||
def execute_query(self, query: str):
|
||||
try:
|
||||
return self.client.execute(query)
|
||||
return self.client.execute(query, with_column_types=True)
|
||||
except Exception as e:
|
||||
print(f"ClickHouse Query Error: {e}")
|
||||
raise e
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
from typing import Dict, Any, Optional
|
||||
import json
|
||||
import functools
|
||||
from app.connectors.postgres import PostgresConnector
|
||||
from app.connectors.clickhouse import ClickHouseConnector
|
||||
from app.connectors.parquet import ParquetConnector
|
||||
from app.models.datasource import DataSource
|
||||
|
||||
@functools.lru_cache(maxsize=32)
|
||||
def _get_cached_connector(ds_type: str, config_json: str):
|
||||
config = json.loads(config_json)
|
||||
|
||||
if ds_type in ["postgres", "postgresql", "supabase"]:
|
||||
# Supabase is just postgres
|
||||
db_url = config.get("connection_string") or \
|
||||
f"postgresql://{config.get('user')}:{config.get('password')}@{config.get('host')}:{config.get('port', 5432)}/{config.get('database')}"
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "sqlite":
|
||||
# SQLite uses connection string usually file path
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url and config.get("file_path"):
|
||||
db_url = f"sqlite:///{config.get('file_path')}"
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "clickhouse":
|
||||
return ClickHouseConnector(
|
||||
host=config.get("host"),
|
||||
port=config.get("port", 9000),
|
||||
user=config.get("user", "default"),
|
||||
password=config.get("password", ""),
|
||||
database=config.get("database", "default")
|
||||
)
|
||||
|
||||
elif ds_type == "parquet":
|
||||
return ParquetConnector(file_path=config.get("file_path"))
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported data source type: {ds_type}")
|
||||
|
||||
def get_connector(datasource: DataSource):
|
||||
# Use JSON string of config as cache key
|
||||
# Ensure stable ordering of keys
|
||||
config_str = json.dumps(datasource.config, sort_keys=True)
|
||||
return _get_cached_connector(datasource.type.lower(), config_str)
|
||||
|
||||
def get_connector_from_config(ds_type: str, config: Dict[str, Any]):
|
||||
# Helper for testing connection without saving to DB
|
||||
# We can use the cached function too, or bypass if we want fresh check
|
||||
# Usually for testing we want fresh check, so let's bypass cache or clear it if needed.
|
||||
# But reusing cache is fine if config is same.
|
||||
config_str = json.dumps(config, sort_keys=True)
|
||||
return _get_cached_connector(ds_type.lower(), config_str)
|
||||
@@ -0,0 +1,58 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any
|
||||
import os
|
||||
|
||||
class ParquetConnector:
|
||||
def __init__(self, file_path: str):
|
||||
self.file_path = file_path
|
||||
if not os.path.exists(self.file_path):
|
||||
raise FileNotFoundError(f"Parquet file not found: {self.file_path}")
|
||||
|
||||
def execute_query(self, query: str) -> List[Dict[str, Any]]:
|
||||
conn = duckdb.connect(":memory:")
|
||||
# Register the parquet file as a view or table
|
||||
# We can use read_parquet directly in query, or register it.
|
||||
# Let's register it as 'parquet_table' for simplicity in generated SQL,
|
||||
# or we can ask LLM to use the filename.
|
||||
# A better approach for generic SQL is to register it as a table name derived from filename or just 'data'.
|
||||
table_name = os.path.splitext(os.path.basename(self.file_path))[0]
|
||||
conn.execute(f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_parquet('{self.file_path}')")
|
||||
|
||||
# If the query doesn't use the table name, we might have issues.
|
||||
# But usually we provide schema with table name to LLM.
|
||||
try:
|
||||
# DuckDB returns a dataframe, we convert to dict
|
||||
df = conn.execute(query).df()
|
||||
return df.to_dict(orient="records")
|
||||
except Exception as e:
|
||||
print(f"Parquet Query Error: {e}")
|
||||
raise e
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def get_schema(self) -> Dict[str, List[str]]:
|
||||
conn = duckdb.connect(":memory:")
|
||||
table_name = os.path.splitext(os.path.basename(self.file_path))[0]
|
||||
conn.execute(f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_parquet('{self.file_path}')")
|
||||
|
||||
try:
|
||||
# Get columns
|
||||
columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
|
||||
schema = {table_name: [f"{col[0]} ({col[1]})" for col in columns]}
|
||||
return schema
|
||||
except Exception as e:
|
||||
print(f"Error getting schema: {e}")
|
||||
return {}
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
def test_connection(self) -> bool:
|
||||
try:
|
||||
conn = duckdb.connect(":memory:")
|
||||
conn.execute(f"SELECT * FROM read_parquet('{self.file_path}') LIMIT 1")
|
||||
conn.close()
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"Parquet Connection Error: {e}")
|
||||
return False
|
||||
Reference in New Issue
Block a user