backend/app/connectors/csv.py

import duckdb
import pandas as pd
from typing import List, Dict, Any
import os
from app.core.files import resolve_upload_file_path

class CSVConnector:
    """Expose a single CSV file as a queryable table via in-memory DuckDB.

    Every public method opens its own throwaway ``:memory:`` DuckDB
    connection, registers the CSV file as a view named after the file
    (see ``_get_table_name``) and closes the connection before returning.
    """

    # Used when the file name contributes no characters to the table name,
    # so that the generated SQL never contains an empty identifier.
    _FALLBACK_TABLE_NAME = "csv_data"

    def __init__(self, file_path: str):
        """Remember ``file_path`` and verify that it exists.

        Raises:
            FileNotFoundError: if nothing exists at ``file_path``.
        """
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"CSV file not found: {self.file_path}")

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so that a path such as
        ``o'brien.csv`` cannot terminate the literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    def _source_sql(self) -> str:
        """Return the ``read_csv_auto(...)`` expression for this file."""
        return f"read_csv_auto({self._sql_literal(self.file_path)})"

    def _register_view(self, conn: Any, table_name: str) -> None:
        """Create (or replace) the view ``table_name`` over the CSV file.

        The identifier is double-quoted so that file names which happen to
        be SQL keywords (``order.csv``, ``select.csv``) still work; this is
        safe because ``_get_table_name`` only emits alphanumerics and
        underscores.
        """
        conn.execute(
            f'CREATE OR REPLACE VIEW "{table_name}" AS '
            f"SELECT * FROM {self._source_sql()}"
        )

    def _get_table_name(self) -> str:
        """Derive a SQL-friendly table name from the file name.

        The extension is dropped, every non-alphanumeric character becomes
        an underscore, and a ``t_`` prefix is added when the result would
        start with a digit. An empty result falls back to
        ``_FALLBACK_TABLE_NAME``.
        """
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        safe_name = "".join(c if c.isalnum() else "_" for c in base)
        if not safe_name:
            return self._FALLBACK_TABLE_NAME
        # Identifiers must not start with a digit.
        if safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run ``query`` against the CSV view and return rows as dicts.

        ``query`` is executed verbatim and is expected to reference the
        table name reported by ``get_schema``.

        Returns:
            One dict per result row, keyed by column name.

        Raises:
            Exception: any error raised while registering the view or
                running the query is printed and then re-raised unchanged.
        """
        conn = duckdb.connect(":memory:")
        try:
            self._register_view(conn, self._get_table_name())
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        except Exception as e:
            print(f"CSV Query Error: {e}")
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Describe the CSV columns as ``{table: [{"name", "type"}, ...]}``.

        Best-effort: on any failure the error is printed and an empty dict
        is returned instead of raising.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._register_view(conn, table_name)
            columns = conn.execute(f'DESCRIBE "{table_name}"').fetchall()
            # Each DESCRIBE row starts with (column name, column type, ...).
            return {
                table_name: [{"name": col[0], "type": col[1]} for col in columns]
            }
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when DuckDB can read at least the start of the file.

        Any failure (including failing to open the DuckDB connection) is
        printed and reported as ``False``.
        """
        conn = None
        try:
            conn = duckdb.connect(":memory:")
            conn.execute(f"SELECT * FROM {self._source_sql()} LIMIT 1")
            return True
        except Exception as e:
            print(f"CSV Connection Error: {e}")
            return False
        finally:
            # Close on the failure path too; the connection used to leak
            # whenever the probe query raised.
            if conn is not None:
                conn.close()
feat: add modelling layer 2026-03-16 22:18:23 +08:00			`import duckdb`
			`import pandas as pd`
			`from typing import List, Dict, Any`
			`import os`
			`from app.core.files import resolve_upload_file_path`

			`class CSVConnector:`
			`def __init__(self, file_path: str):`
			`self.file_path = file_path`
			`if not os.path.exists(self.file_path):`
			`raise FileNotFoundError(f"CSV file not found: {self.file_path}")`

			`def _get_table_name(self) -> str:`
			`# Normalize table name to be SQL safe-ish`
			`base = os.path.splitext(os.path.basename(self.file_path))[0]`
			`# Replace non-alphanumeric chars with underscore`
			`safe_name = "".join([c if c.isalnum() else "_" for c in base])`
			`# Ensure it doesn't start with a number`
			`if safe_name and safe_name[0].isdigit():`
			`safe_name = f"t_{safe_name}"`
			`return safe_name`

			`def execute_query(self, query: str) -> List[Dict[str, Any]]:`
			`conn = duckdb.connect(":memory:")`
			`table_name = self._get_table_name()`

			`# Register the csv file as a view`
			`# read_csv_auto is powerful`
			`try:`
			`conn.execute(f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_csv_auto('{self.file_path}')")`

			`# Execute the user query`
			`# The query should rely on the table name provided in schema`
			`df = conn.execute(query).df()`
			`return df.to_dict(orient="records")`
			`except Exception as e:`
			`print(f"CSV Query Error: {e}")`
			`raise e`
			`finally:`
			`conn.close()`

			`def get_schema(self) -> Dict[str, List[Dict[str, str]]]:`
			`conn = duckdb.connect(":memory:")`
			`table_name = self._get_table_name()`

			`try:`
			`conn.execute(f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_csv_auto('{self.file_path}')")`

			`# Get columns`
			`columns = conn.execute(f"DESCRIBE {table_name}").fetchall()`
			`# col[0] is name, col[1] is type`
			`schema = {table_name: [{"name": col[0], "type": col[1]} for col in columns]}`
			`return schema`
			`except Exception as e:`
			`print(f"Error getting schema: {e}")`
			`return {}`
			`finally:`
			`conn.close()`

			`def test_connection(self) -> bool:`
			`try:`
			`conn = duckdb.connect(":memory:")`
			`conn.execute(f"SELECT * FROM read_csv_auto('{self.file_path}') LIMIT 1")`
			`conn.close()`
			`return True`
			`except Exception as e:`
			`print(f"CSV Connection Error: {e}")`
			`return False`