Update 2026-05-13 16:43:53

2026-05-13 16:43:53 +08:00
parent 6af5c584f4
commit afd7c5fe85
490 changed files with 850 additions and 922 deletions
@@ -0,0 +1,50 @@
+from clickhouse_driver import Client
+import os
+
+class ClickHouseConnector:
+    """Thin wrapper around a ``clickhouse_driver.Client``.
+
+    Every connection setting may be passed explicitly. A missing or falsy
+    value falls back to the matching ``CLICKHOUSE_*`` environment variable
+    and, if that is unset too, to a built-in default.
+    """
+
+    def __init__(
+        self,
+        host: str = None,
+        port: int = None,
+        user: str = None,
+        password: str = '',
+        database: str = None,
+    ):
+        """Resolve the connection settings and build the client.
+
+        Args:
+            host: Server host; falls back to ``CLICKHOUSE_HOST``, then
+                ``"localhost"``.
+            port: Server port; falls back to ``CLICKHOUSE_PORT``, then 9000.
+            user: User name; falls back to ``CLICKHOUSE_USER``, then
+                ``"default"``.
+            password: Password; falls back to ``CLICKHOUSE_PASSWORD``, then
+                the empty string.
+            database: Database name; falls back to ``CLICKHOUSE_DB``, then
+                ``"default"``.
+        """
+        # The parameter defaults are None (not the final default values) so
+        # that the environment-variable fallbacks are actually reachable.
+        self.host = host or os.getenv("CLICKHOUSE_HOST", "localhost")
+        # int() also normalises a port that arrives as a string, whether it
+        # comes from the environment or from the caller.
+        self.port = int(port or os.getenv("CLICKHOUSE_PORT", 9000))
+        self.user = user or os.getenv("CLICKHOUSE_USER", "default")
+        self.password = password or os.getenv("CLICKHOUSE_PASSWORD", "")
+        self.database = database or os.getenv("CLICKHOUSE_DB", "default")
+
+        self.client = Client(
+            host=self.host,
+            port=self.port,
+            user=self.user,
+            password=self.password,
+            database=self.database,
+        )
+
+    def execute_query(self, query: str):
+        """Run ``query`` with ``with_column_types=True`` and return the result.
+
+        Any exception is printed and then re-raised unchanged.
+        """
+        try:
+            return self.client.execute(query, with_column_types=True)
+        except Exception as e:
+            print(f"ClickHouse Query Error: {e}")
+            raise
+
+    def get_schema(self):
+        """Return ``{table: [{"name": ..., "type": ...}, ...]}`` for the current database.
+
+        Best-effort: on any error the message is printed and ``{}`` is
+        returned.
+        """
+        query = "SELECT table, name, type FROM system.columns WHERE database = currentDatabase()"
+        try:
+            schema = {}
+            for table, name, col_type in self.client.execute(query):
+                schema.setdefault(table, []).append({"name": name, "type": col_type})
+            return schema
+        except Exception as e:
+            print(f"Error getting schema: {e}")
+            return {}
+
+    def test_connection(self) -> bool:
+        """Return True if ``SELECT 1`` succeeds; print the error and return False otherwise."""
+        try:
+            self.client.execute("SELECT 1")
+            return True
+        except Exception as e:
+            print(f"ClickHouse Connection Error: {e}")
+            return False
+
+# Module-level connector instance, built with no explicit arguments when this module is imported.
+clickhouse_connector = ClickHouseConnector()
@@ -0,0 +1,68 @@
+import duckdb
+import pandas as pd
+from typing import List, Dict, Any
+import os
+from app.core.files import resolve_upload_file_path
+
+class CSVConnector:
+    """Query a single CSV file through a throw-away in-memory DuckDB view.
+
+    Every call opens a fresh ``:memory:`` DuckDB connection, exposes the file
+    as a view named after the file (see ``_get_table_name``) and closes the
+    connection again before returning.
+    """
+
+    def __init__(self, file_path: str):
+        """Remember ``file_path``.
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+        """
+        self.file_path = file_path
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"CSV file not found: {self.file_path}")
+
+    def _get_table_name(self) -> str:
+        """Derive the view name from the file name (extension dropped)."""
+        base = os.path.splitext(os.path.basename(self.file_path))[0]
+        # Replace every non-alphanumeric character with an underscore.
+        safe_name = "".join(c if c.isalnum() else "_" for c in base)
+        # An identifier must not start with a digit.
+        if safe_name and safe_name[0].isdigit():
+            safe_name = f"t_{safe_name}"
+        return safe_name
+
+    def _source_sql(self) -> str:
+        """Return the ``read_csv_auto('...')`` expression for this file.
+
+        Single quotes in the path are doubled so the path stays one SQL
+        string literal instead of terminating it early.
+        """
+        escaped_path = str(self.file_path).replace("'", "''")
+        return f"read_csv_auto('{escaped_path}')"
+
+    def _create_view(self, conn, table_name: str) -> None:
+        """Register the CSV file on ``conn`` as a view called ``table_name``."""
+        conn.execute(
+            f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM {self._source_sql()}"
+        )
+
+    def execute_query(self, query: str) -> List[Dict[str, Any]]:
+        """Run ``query`` against the CSV view and return the rows as dicts.
+
+        The query is expected to reference the view by the name reported in
+        ``get_schema``. Errors are printed and re-raised.
+        """
+        conn = duckdb.connect(":memory:")
+        try:
+            self._create_view(conn, self._get_table_name())
+            df = conn.execute(query).df()
+            return df.to_dict(orient="records")
+        except Exception as e:
+            print(f"CSV Query Error: {e}")
+            raise
+        finally:
+            conn.close()
+
+    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
+        """Return ``{view_name: [{"name": ..., "type": ...}, ...]}``.
+
+        Best-effort: on any error the message is printed and ``{}`` is
+        returned.
+        """
+        conn = duckdb.connect(":memory:")
+        table_name = self._get_table_name()
+        try:
+            self._create_view(conn, table_name)
+            # Each DESCRIBE row: index 0 is the column name, index 1 its type.
+            columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
+            return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
+        except Exception as e:
+            print(f"Error getting schema: {e}")
+            return {}
+        finally:
+            conn.close()
+
+    def test_connection(self) -> bool:
+        """Return True if one row can be read from the file, else print the error and return False."""
+        try:
+            conn = duckdb.connect(":memory:")
+            try:
+                conn.execute(f"SELECT * FROM {self._source_sql()} LIMIT 1")
+            finally:
+                # Close even when the probe query fails.
+                conn.close()
+            return True
+        except Exception as e:
+            print(f"CSV Connection Error: {e}")
+            return False
@@ -0,0 +1,48 @@
+import duckdb
+import pandas as pd
+from typing import List, Dict, Any, Optional
+import os
+
+class DuckDBConnector:
+    """Connector for a DuckDB database file (or ``:memory:``).
+
+    A new connection to ``db_path`` is opened for every call and closed
+    before the call returns.
+    """
+
+    def __init__(self, db_path: str = ":memory:"):
+        """Remember the database path; no connection is opened here."""
+        self.db_path = db_path
+
+    @staticmethod
+    def _quote_identifier(name: str) -> str:
+        """Return ``name`` as a double-quoted SQL identifier.
+
+        Embedded double quotes are doubled, so names containing special
+        characters or matching reserved words can be used safely.
+        """
+        escaped = str(name).replace('"', '""')
+        return f'"{escaped}"'
+
+    def execute_query(self, query: str) -> List[Dict[str, Any]]:
+        """Run ``query`` and return the result rows as a list of dicts."""
+        conn = duckdb.connect(self.db_path)
+        try:
+            df = conn.execute(query).df()
+            return df.to_dict(orient="records")
+        finally:
+            conn.close()
+
+    def get_schema(self) -> Dict[str, Any]:
+        """Describe every table listed by ``SHOW TABLES``.
+
+        Returns:
+            ``{table: {"columns": [{"name", "type"}, ...],
+            "primary_keys": [], "foreign_keys": []}}``. The key lists are
+            always empty because they are not read from DESCRIBE output.
+        """
+        conn = duckdb.connect(self.db_path)
+        try:
+            schema = {}
+            tables = conn.execute("SHOW TABLES").fetchall()
+            for (table_name,) in tables:
+                # Quote the name: it comes from the database and may not be
+                # a bare identifier.
+                columns_info = conn.execute(
+                    f"DESCRIBE {self._quote_identifier(table_name)}"
+                ).fetchall()
+                schema[table_name] = {
+                    "columns": [{"name": col[0], "type": col[1]} for col in columns_info],
+                    "primary_keys": [],  # DuckDB describe doesn't easily show PKs in this format
+                    "foreign_keys": [],
+                }
+            return schema
+        finally:
+            conn.close()
+
+    def test_connection(self) -> bool:
+        """Return True if ``SELECT 1`` succeeds; print the error and return False otherwise."""
+        try:
+            conn = duckdb.connect(self.db_path)
+            try:
+                conn.execute("SELECT 1")
+            finally:
+                # Close even when the probe query fails.
+                conn.close()
+            return True
+        except Exception as e:
+            print(f"DuckDB Connection Error: {e}")
+            return False
@@ -0,0 +1,82 @@
+from typing import Dict, Any, Optional
+import json
+import functools
+from app.connectors.postgres import PostgresConnector
+from app.connectors.clickhouse import ClickHouseConnector
+from app.connectors.parquet import ParquetConnector
+from app.connectors.csv import CSVConnector
+from app.connectors.duckdb import DuckDBConnector
+from app.models.datasource import DataSource
+from app.core.files import resolve_upload_file_path
+
+def _build_sql_url(scheme: str, config: Dict[str, Any], default_port: int) -> str:
+    """Assemble ``scheme://user:password@host:port/database`` from ``config``.
+
+    User and password are percent-encoded so characters such as ``@``, ``:``
+    or ``/`` cannot be mistaken for URL delimiters. ``port`` falls back to
+    ``default_port`` when missing or falsy.
+    """
+    from urllib.parse import quote
+
+    user = quote(str(config.get("user")), safe="")
+    password = quote(str(config.get("password")), safe="")
+    port = config.get("port") or default_port
+    return f"{scheme}://{user}:{password}@{config.get('host')}:{port}/{config.get('database')}"
+
+
+@functools.lru_cache(maxsize=32)
+def _get_cached_connector(ds_type: str, config_json: str):
+    """Build the connector for ``ds_type`` from a JSON-encoded config.
+
+    Results are memoised by ``functools.lru_cache`` on the
+    ``(ds_type, config_json)`` pair, so an identical type and config string
+    yields the same connector object.
+
+    Args:
+        ds_type: Lower-case data source type, e.g. ``"postgres"``, ``"csv"``.
+        config_json: The data source config serialised as JSON.
+
+    Raises:
+        ValueError: If ``ds_type`` is not supported, or a sqlite source has
+            neither ``connection_string`` nor ``file_path``.
+    """
+    config = json.loads(config_json)
+
+    if ds_type in ("postgres", "postgresql", "supabase"):
+        db_url = config.get("connection_string")
+        if not db_url:
+            default_port = 6543 if ds_type == "supabase" else 5432
+            db_url = _build_sql_url("postgresql", config, default_port)
+
+        # Supabase sources get sslmode=require unless already specified.
+        if ds_type == "supabase" and "?" not in db_url:
+            db_url += "?sslmode=require"
+        elif ds_type == "supabase" and "sslmode=" not in db_url:
+            db_url += "&sslmode=require"
+
+        return PostgresConnector(db_url=db_url)
+
+    elif ds_type == "mysql":
+        db_url = config.get("connection_string")
+        if not db_url:
+            db_url = _build_sql_url("mysql+pymysql", config, 3306)
+        elif not db_url.startswith("mysql+pymysql://"):
+            # Only the scheme prefix should be rewritten, hence count=1.
+            db_url = db_url.replace("mysql://", "mysql+pymysql://", 1)
+        return PostgresConnector(db_url=db_url)
+
+    elif ds_type == "sqlite":
+        # SQLite uses connection string usually file path
+        db_url = config.get("connection_string")
+        if not db_url and config.get("file_path"):
+            file_path = str(resolve_upload_file_path(config.get("file_path")))
+            db_url = f"sqlite:///{file_path}"
+        if not db_url:
+            # Passing an empty URL on would make PostgresConnector fall back
+            # to its own default URL and talk to an unrelated database.
+            raise ValueError(
+                "SQLite data source requires 'connection_string' or 'file_path'"
+            )
+        return PostgresConnector(db_url=db_url)
+
+    elif ds_type == "clickhouse":
+        return ClickHouseConnector(
+            host=config.get("host"),
+            port=config.get("port", 9000),
+            user=config.get("user", "default"),
+            password=config.get("password", ""),
+            database=config.get("database", "default"),
+        )
+
+    elif ds_type == "duckdb":
+        db_path = config.get("database") or config.get("file_path") or ":memory:"
+        if db_path != ":memory:":
+            db_path = str(resolve_upload_file_path(db_path))
+        return DuckDBConnector(db_path=db_path)
+
+    elif ds_type == "parquet":
+        file_path = str(resolve_upload_file_path(config.get("file_path")))
+        return ParquetConnector(file_path=file_path)
+
+    elif ds_type == "csv":
+        file_path = str(resolve_upload_file_path(config.get("file_path")))
+        return CSVConnector(file_path=file_path)
+
+    else:
+        raise ValueError(f"Unsupported data source type: {ds_type}")
+
+def get_connector(datasource: DataSource):
+    """Return the (cached) connector for a stored data source.
+
+    The config is serialised with sorted keys so that equal configs always
+    map to the same cache key.
+    """
+    cache_key = json.dumps(datasource.config, sort_keys=True)
+    normalized_type = datasource.type.lower()
+    return _get_cached_connector(normalized_type, cache_key)
+
+def get_connector_from_config(ds_type: str, config: Dict[str, Any]):
+    """Return the (cached) connector for a raw type/config pair.
+
+    Used to test a connection before the data source is saved to the DB.
+    """
+    serialized = json.dumps(config, sort_keys=True)
+    normalized_type = ds_type.lower()
+    return _get_cached_connector(normalized_type, serialized)
@@ -0,0 +1,58 @@
+import duckdb
+import pandas as pd
+from typing import List, Dict, Any
+import os
+
+class ParquetConnector:
+    """Query a single Parquet file through a throw-away in-memory DuckDB view.
+
+    Every call opens a fresh ``:memory:`` DuckDB connection, exposes the file
+    as a view named after the file (see ``_get_table_name``) and closes the
+    connection again before returning.
+    """
+
+    def __init__(self, file_path: str):
+        """Remember ``file_path``.
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+        """
+        self.file_path = file_path
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"Parquet file not found: {self.file_path}")
+
+    def _get_table_name(self) -> str:
+        """Derive a SQL-safe view name from the file name (extension dropped).
+
+        Non-alphanumeric characters become underscores and a leading digit
+        gets a ``t_`` prefix; a stem that is already a plain identifier is
+        returned unchanged.
+        """
+        base = os.path.splitext(os.path.basename(self.file_path))[0]
+        safe_name = "".join(c if c.isalnum() else "_" for c in base)
+        if safe_name and safe_name[0].isdigit():
+            safe_name = f"t_{safe_name}"
+        return safe_name
+
+    def _source_sql(self) -> str:
+        """Return the ``read_parquet('...')`` expression for this file.
+
+        Single quotes in the path are doubled so the path stays one SQL
+        string literal instead of terminating it early.
+        """
+        escaped_path = str(self.file_path).replace("'", "''")
+        return f"read_parquet('{escaped_path}')"
+
+    def _create_view(self, conn, table_name: str) -> None:
+        """Register the Parquet file on ``conn`` as a view called ``table_name``."""
+        conn.execute(
+            f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM {self._source_sql()}"
+        )
+
+    def execute_query(self, query: str) -> List[Dict[str, Any]]:
+        """Run ``query`` against the Parquet view and return the rows as dicts.
+
+        The query is expected to reference the view by the name reported in
+        ``get_schema``. Errors are printed and re-raised.
+        """
+        conn = duckdb.connect(":memory:")
+        try:
+            self._create_view(conn, self._get_table_name())
+            # DuckDB returns a dataframe, we convert to dict
+            df = conn.execute(query).df()
+            return df.to_dict(orient="records")
+        except Exception as e:
+            print(f"Parquet Query Error: {e}")
+            raise
+        finally:
+            conn.close()
+
+    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
+        """Return ``{view_name: [{"name": ..., "type": ...}, ...]}``.
+
+        A failure while creating the view propagates to the caller; a
+        failure while describing it is printed and yields ``{}``. The
+        connection is closed in both cases.
+        """
+        conn = duckdb.connect(":memory:")
+        table_name = self._get_table_name()
+        try:
+            self._create_view(conn, table_name)
+            try:
+                columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
+                return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
+            except Exception as e:
+                print(f"Error getting schema: {e}")
+                return {}
+        finally:
+            conn.close()
+
+    def test_connection(self) -> bool:
+        """Return True if one row can be read from the file, else print the error and return False."""
+        try:
+            conn = duckdb.connect(":memory:")
+            try:
+                conn.execute(f"SELECT * FROM {self._source_sql()} LIMIT 1")
+            finally:
+                # Close even when the probe query fails.
+                conn.close()
+            return True
+        except Exception as e:
+            print(f"Parquet Connection Error: {e}")
+            return False
@@ -0,0 +1,113 @@
+from sqlalchemy import create_engine, text
+from sqlalchemy.orm import sessionmaker
+from typing import Generator
+import os
+
+class PostgresConnector:
+    """SQLAlchemy-engine connector: raw queries, schema inspection, health check."""
+
+    def __init__(self, db_url: str = None):
+        """Create the engine and session factory for ``db_url``.
+
+        A falsy ``db_url`` is replaced by the ``POSTGRES_URL`` environment
+        variable, or by a placeholder URL when that is unset.
+        """
+        if db_url:
+            self.db_url = db_url
+        else:
+            self.db_url = os.getenv("POSTGRES_URL", "postgresql://user:password@localhost:5432/dbname")
+        self.engine = create_engine(self.db_url)
+        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)
+
+    def get_db(self) -> Generator:
+        """Yield one session from ``SessionLocal`` and close it afterwards."""
+        session = self.SessionLocal()
+        try:
+            yield session
+        finally:
+            session.close()
+
+    def execute_query(self, query: str):
+        """Execute ``query`` as textual SQL and return the rows as a list of dicts."""
+        with self.engine.connect() as connection:
+            rows = connection.execute(text(query))
+            return [dict(row._mapping) for row in rows]
+
+    @staticmethod
+    def _table_entry(raw_columns, pk_constraint, raw_foreign_keys):
+        """Shape inspector output for one table into the schema entry dict."""
+        column_list = [
+            {"name": col['name'], "type": str(col['type'])}
+            for col in raw_columns
+        ]
+        primary_keys = pk_constraint.get('constrained_columns', []) if pk_constraint else []
+        fk_list = [
+            {
+                "constrained_columns": fk['constrained_columns'],
+                "referred_table": fk['referred_table'],
+                "referred_columns": fk['referred_columns']
+            }
+            for fk in raw_foreign_keys
+        ]
+        return {
+            "columns": column_list,
+            "primary_keys": primary_keys,
+            "foreign_keys": fk_list
+        }
+
+    def get_schema(self):
+        """Return ``{table: {"columns", "primary_keys", "foreign_keys"}}``.
+
+        Uses the inspector's bulk ``get_multi_*`` calls when they exist and
+        per-table calls otherwise. On failure the traceback and message are
+        printed and the exception is re-raised.
+        """
+        try:
+            from sqlalchemy import inspect
+            inspector = inspect(self.engine)
+            # Default schema for postgres is 'public', sqlite is None
+            if self.engine.dialect.name == 'postgresql':
+                schema_name = 'public'
+            else:
+                schema_name = None
+
+            table_names = inspector.get_table_names(schema=schema_name)
+            result = {}
+
+            # Bulk reflection avoids one round of queries per table.
+            if hasattr(inspector, 'get_multi_columns'):
+                all_columns = inspector.get_multi_columns(schema=schema_name)
+                all_pks = inspector.get_multi_pk_constraint(schema=schema_name)
+                all_fks = inspector.get_multi_foreign_keys(schema=schema_name)
+
+                for table_name in table_names:
+                    lookup = (schema_name, table_name)
+                    result[table_name] = self._table_entry(
+                        all_columns.get(lookup, []),
+                        all_pks.get(lookup),
+                        all_fks.get(lookup, []),
+                    )
+                return result
+
+            # Fallback for older SQLAlchemy versions
+            for table_name in table_names:
+                table_columns = inspector.get_columns(table_name, schema=schema_name)
+                table_pk = inspector.get_pk_constraint(table_name, schema=schema_name)
+                table_fks = inspector.get_foreign_keys(table_name, schema=schema_name)
+                result[table_name] = self._table_entry(table_columns, table_pk, table_fks)
+            return result
+        except Exception as e:
+            import traceback
+            traceback.print_exc()
+            print(f"Error getting schema: {e}")
+            raise e
+
+    def test_connection(self) -> bool:
+        """Run ``SELECT 1``; return True on success, print and re-raise on failure."""
+        try:
+            with self.engine.connect() as connection:
+                connection.execute(text("SELECT 1"))
+        except Exception as e:
+            print(f"PostgreSQL Connection Error: {e}")
+            raise e
+        return True
+
+# Module-level connector instance, built with no explicit db_url when this module is imported.
+postgres_connector = PostgresConnector()