feat: add data source

2026-03-15 19:36:02 +08:00
parent 219944f059
commit f1db709aae
14 changed files with 851 additions and 22 deletions
@@ -19,7 +19,7 @@ class ClickHouseConnector:

    def execute_query(self, query: str):
+        # Runs `query` through self.client; any failure is printed and then re-raised.
        try:
-            return self.client.execute(query)
+            # NOTE(review): with_column_types=True presumably makes the client return
+            # (rows, column_types) instead of bare rows -- confirm that every caller
+            # of execute_query unpacks the new return shape.
+            return self.client.execute(query, with_column_types=True)
        except Exception as e:
            print(f"ClickHouse Query Error: {e}")
            raise e
@@ -0,0 +1,53 @@
+from typing import Dict, Any, Optional
+import json
+import functools
+from app.connectors.postgres import PostgresConnector
+from app.connectors.clickhouse import ClickHouseConnector
+from app.connectors.parquet import ParquetConnector
+from app.models.datasource import DataSource
+
+@functools.lru_cache(maxsize=32)
+def _get_cached_connector(ds_type: str, config_json: str):
+    """Build the connector for one data source, memoised by (type, config).
+
+    ``functools.lru_cache`` keeps up to 32 distinct ``(ds_type, config_json)``
+    pairs, so repeated calls with equal arguments return the same connector
+    object. Calls that raise are not cached and are re-evaluated next time.
+
+    Args:
+        ds_type: Data source type; the callers in this module lower-case it.
+        config_json: The data source config serialised as a JSON object.
+
+    Returns:
+        A PostgresConnector, ClickHouseConnector or ParquetConnector.
+
+    Raises:
+        ValueError: If ``ds_type`` is not supported, or the config lacks the
+            fields needed to locate the data source.
+    """
+    # Function-scope import so the block does not depend on a file-level change.
+    from urllib.parse import quote
+
+    config = json.loads(config_json)
+
+    if ds_type in ("postgres", "postgresql", "supabase"):
+        # Supabase is just postgres
+        db_url = config.get("connection_string")
+        if not db_url:
+            host = config.get("host")
+            database = config.get("database")
+            if not host or not database:
+                # Previously a missing field was rendered as the text "None"
+                # inside the URL and only failed later, at connect time.
+                raise ValueError(
+                    "Postgres data source requires 'connection_string' "
+                    "or both 'host' and 'database'"
+                )
+            user = config.get("user")
+            password = config.get("password")
+            auth = ""
+            if user or password:
+                # Percent-encode credentials so characters such as '@', ':'
+                # or '/' in a password cannot change how the URL is parsed.
+                auth = quote(str(user or ""), safe="")
+                if password:
+                    auth += ":" + quote(str(password), safe="")
+                auth += "@"
+            port = config.get("port") or 5432
+            db_url = f"postgresql://{auth}{host}:{port}/{database}"
+        return PostgresConnector(db_url=db_url)
+
+    elif ds_type == "sqlite":
+        # SQLite is addressed by URL as well, so it goes through the same
+        # URL-driven connector class as Postgres.
+        db_url = config.get("connection_string")
+        if not db_url and config.get("file_path"):
+            db_url = f"sqlite:///{config.get('file_path')}"
+        if not db_url:
+            raise ValueError(
+                "SQLite data source requires 'connection_string' or 'file_path'"
+            )
+        return PostgresConnector(db_url=db_url)
+
+    elif ds_type == "clickhouse":
+        return ClickHouseConnector(
+            host=config.get("host"),
+            port=config.get("port", 9000),
+            user=config.get("user", "default"),
+            password=config.get("password", ""),
+            database=config.get("database", "default"),
+        )
+
+    elif ds_type == "parquet":
+        file_path = config.get("file_path")
+        if not file_path:
+            # Fail with a clear message instead of the TypeError that
+            # os.path.exists(None) raises inside ParquetConnector.
+            raise ValueError("Parquet data source requires 'file_path'")
+        return ParquetConnector(file_path=file_path)
+
+    else:
+        raise ValueError(f"Unsupported data source type: {ds_type}")
+
+def get_connector(datasource: DataSource):
+    """Return the connector for a stored data source, reusing cached instances.
+
+    The config is serialised with sorted keys so that configs with equal
+    content produce the same cache key regardless of key order.
+    """
+    cache_key = json.dumps(datasource.config, sort_keys=True)
+    normalized_type = datasource.type.lower()
+    return _get_cached_connector(normalized_type, cache_key)
+
+def get_connector_from_config(ds_type: str, config: Dict[str, Any]):
+    """Return a connector for a config that has not been saved to the DB.
+
+    Intended for connection testing. The lookup goes through the same cache
+    as get_connector, so a config identical to one already seen yields the
+    cached connector rather than a freshly built one.
+    """
+    serialized = json.dumps(config, sort_keys=True)
+    kind = ds_type.lower()
+    return _get_cached_connector(kind, serialized)
@@ -0,0 +1,58 @@
+import duckdb
+import pandas as pd
+from typing import List, Dict, Any
+import os
+
+class ParquetConnector:
+    """Query a single Parquet file through a throwaway in-memory DuckDB database.
+
+    Each call opens a new ``:memory:`` connection and exposes the file as a
+    view named after the file (base name without extension), so SQL can
+    refer to ``data`` for ``/path/data.parquet``.
+    """
+
+    def __init__(self, file_path: str):
+        """Remember ``file_path``.
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+        """
+        self.file_path = file_path
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"Parquet file not found: {self.file_path}")
+
+    @staticmethod
+    def _quote_identifier(name: str) -> str:
+        """Return ``name`` as a double-quoted SQL identifier."""
+        return '"' + name.replace('"', '""') + '"'
+
+    def _table_name(self) -> str:
+        """Return the view name: the file's base name without its extension."""
+        return os.path.splitext(os.path.basename(self.file_path))[0]
+
+    def _path_literal(self) -> str:
+        """Return the file path as a single-quoted SQL string literal."""
+        return "'" + self.file_path.replace("'", "''") + "'"
+
+    def _open_with_view(self):
+        """Open an in-memory connection with the file registered as a view.
+
+        The view name is quoted so file names that are not bare identifiers
+        (``my-data.parquet``, ``2024.parquet``) no longer break the statement.
+        The connection is closed before any setup error propagates.
+        """
+        conn = duckdb.connect(":memory:")
+        try:
+            conn.execute(
+                f"CREATE OR REPLACE VIEW {self._quote_identifier(self._table_name())} "
+                f"AS SELECT * FROM read_parquet({self._path_literal()})"
+            )
+        except Exception:
+            conn.close()
+            raise
+        return conn
+
+    def execute_query(self, query: str) -> List[Dict[str, Any]]:
+        """Run ``query`` against the view and return one dict per result row.
+
+        Raises:
+            Exception: Whatever DuckDB raises; query errors are printed first.
+        """
+        # Setup errors propagate without the "Query Error" print, as before.
+        conn = self._open_with_view()
+        try:
+            # DuckDB returns a dataframe, we convert to dict
+            df = conn.execute(query).df()
+            return df.to_dict(orient="records")
+        except Exception as e:
+            print(f"Parquet Query Error: {e}")
+            raise
+        finally:
+            conn.close()
+
+    def get_schema(self) -> Dict[str, List[str]]:
+        """Return ``{view_name: ["column (type)", ...]}`` for the file.
+
+        Returns an empty dict when DESCRIBE fails; a failure while creating
+        the view still propagates, as it did before.
+        """
+        table_name = self._table_name()
+        conn = self._open_with_view()
+        try:
+            columns = conn.execute(
+                f"DESCRIBE {self._quote_identifier(table_name)}"
+            ).fetchall()
+            return {table_name: [f"{col[0]} ({col[1]})" for col in columns]}
+        except Exception as e:
+            print(f"Error getting schema: {e}")
+            return {}
+        finally:
+            conn.close()
+
+    def test_connection(self) -> bool:
+        """Return True if DuckDB can read one row from the file, else False."""
+        try:
+            conn = duckdb.connect(":memory:")
+            try:
+                conn.execute(
+                    f"SELECT * FROM read_parquet({self._path_literal()}) LIMIT 1"
+                )
+            finally:
+                # Close on failure too; previously a failed read leaked it.
+                conn.close()
+            return True
+        except Exception as e:
+            print(f"Parquet Connection Error: {e}")
+            return False