feat: add modelling layer

2026-03-16 22:18:23 +08:00
parent a1a855a126
commit 720c30a893
16 changed files with 1115 additions and 106 deletions
@@ -33,7 +33,7 @@ class ClickHouseConnector:
                table = row[0]
                if table not in schema:
                    schema[table] = []
-                schema[table].append(f"{row[1]} ({row[2]})")
+                schema[table].append({"name": row[1], "type": row[2]})
            return schema
        except Exception as e:
            print(f"Error getting schema: {e}")
@@ -0,0 +1,68 @@
+import duckdb
+import pandas as pd
+from typing import List, Dict, Any
+import os
+from app.core.files import resolve_upload_file_path
+
+class CSVConnector:
+    """Query a single CSV file through an in-memory DuckDB view named after the file."""
+
+    def __init__(self, file_path: str):
+        """Store ``file_path``; raise FileNotFoundError if it does not exist."""
+        self.file_path = file_path
+        if not os.path.exists(self.file_path):
+            raise FileNotFoundError(f"CSV file not found: {self.file_path}")
+
+    def _get_table_name(self) -> str:
+        """Derive a SQL-safe-ish view name from the file's base name."""
+        base = os.path.splitext(os.path.basename(self.file_path))[0]
+        # Replace non-alphanumeric chars with underscore
+        safe_name = "".join(c if c.isalnum() else "_" for c in base)
+        # Ensure it doesn't start with a number
+        if safe_name and safe_name[0].isdigit():
+            safe_name = f"t_{safe_name}"
+        return safe_name
+
+    def _csv_source(self) -> str:
+        """Return a ``read_csv_auto(...)`` call with the path as an escaped SQL literal."""
+        # Double embedded single quotes so a path such as "bob's.csv" cannot
+        # terminate the literal early and break (or alter) the statement.
+        escaped = self.file_path.replace("'", "''")
+        return f"read_csv_auto('{escaped}')"
+
+    def _register_view(self, conn) -> str:
+        """Expose the CSV on ``conn`` as a view and return the view's name."""
+        table_name = self._get_table_name()
+        conn.execute(f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM {self._csv_source()}")
+        return table_name
+
+    def execute_query(self, query: str) -> List[Dict[str, Any]]:
+        """Run ``query`` against the CSV view and return the rows as dicts.
+
+        Any error is printed and then re-raised to the caller.
+        """
+        conn = duckdb.connect(":memory:")
+        try:
+            self._register_view(conn)
+            # The query should rely on the table name provided in schema
+            return conn.execute(query).df().to_dict(orient="records")
+        except Exception as e:
+            print(f"CSV Query Error: {e}")
+            raise  # bare raise keeps the original traceback
+        finally:
+            conn.close()
+
+    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
+        """Return ``{view_name: [{"name": ..., "type": ...}, ...]}``.
+
+        Returns ``{}`` (after printing the error) if the file cannot be described.
+        """
+        conn = duckdb.connect(":memory:")
+        try:
+            table_name = self._register_view(conn)
+            # DESCRIBE rows: col[0] is name, col[1] is type
+            columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
+            return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
+        except Exception as e:
+            print(f"Error getting schema: {e}")
+            return {}
+        finally:
+            conn.close()
+
+    def test_connection(self) -> bool:
+        """Return True if a one-row probe query over the CSV executes without error."""
+        conn = None
+        try:
+            conn = duckdb.connect(":memory:")
+            conn.execute(f"SELECT * FROM {self._csv_source()} LIMIT 1")
+            return True
+        except Exception as e:
+            print(f"CSV Connection Error: {e}")
+            return False
+        finally:
+            # Close on every path; the connection used to leak when the probe failed.
+            if conn is not None:
+                conn.close()
@@ -4,6 +4,7 @@ import functools
 from app.connectors.postgres import PostgresConnector
 from app.connectors.clickhouse import ClickHouseConnector
 from app.connectors.parquet import ParquetConnector
+from app.connectors.csv import CSVConnector
 from app.models.datasource import DataSource
 from app.core.files import resolve_upload_file_path

@@ -37,6 +38,10 @@ def _get_cached_connector(ds_type: str, config_json: str):
    elif ds_type == "parquet":
        file_path = str(resolve_upload_file_path(config.get("file_path")))
        return ParquetConnector(file_path=file_path)
+    
+    elif ds_type == "csv":
+        file_path = str(resolve_upload_file_path(config.get("file_path")))
+        return CSVConnector(file_path=file_path)
        
    else:
        raise ValueError(f"Unsupported data source type: {ds_type}")
@@ -31,7 +31,7 @@ class ParquetConnector:
        finally:
            conn.close()

-    def get_schema(self) -> Dict[str, List[str]]:
+    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        conn = duckdb.connect(":memory:")
        table_name = os.path.splitext(os.path.basename(self.file_path))[0]
        conn.execute(f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_parquet('{self.file_path}')")
@@ -39,7 +39,7 @@ class ParquetConnector:
        try:
            # Get columns
            columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
-            schema = {table_name: [f"{col[0]} ({col[1]})" for col in columns]}
+            schema = {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
            return schema
        except Exception as e:
            print(f"Error getting schema: {e}")
@@ -22,6 +22,9 @@ class PostgresConnector:
            return [dict(row._mapping) for row in result]

    def get_schema(self):
+        if self.engine.dialect.name == "sqlite":
+            return self._get_sqlite_schema()
+
        query = """
        SELECT table_name, column_name, data_type
        FROM information_schema.columns
@@ -35,12 +38,27 @@ class PostgresConnector:
                table = row['table_name']
                if table not in schema:
                    schema[table] = []
-                schema[table].append(f"{row['column_name']} ({row['data_type']})")
+                schema[table].append({"name": row['column_name'], "type": row['data_type']})
            return schema
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}

+    def _get_sqlite_schema(self):
+        """Build ``{table: [{"name": ..., "type": ...}, ...]}`` using SQLAlchemy's inspector.
+
+        Any failure (including the import) is printed and yields ``{}``.
+        """
+        try:
+            from sqlalchemy import inspect
+            insp = inspect(self.engine)
+            return {
+                tbl: [
+                    {"name": column['name'], "type": str(column['type'])}
+                    for column in insp.get_columns(tbl)
+                ]
+                for tbl in insp.get_table_names()
+            }
+        except Exception as e:
+            print(f"Error getting SQLite schema: {e}")
+            return {}
+
    def test_connection(self) -> bool:
        try:
            with self.engine.connect() as connection: