69 lines
2.5 KiB
Python
69 lines
2.5 KiB
Python
|
|
import duckdb
|
||
|
|
import pandas as pd
|
||
|
|
from typing import List, Dict, Any
|
||
|
|
import os
|
||
|
|
from app.core.files import resolve_upload_file_path
|
||
|
|
|
||
|
|
class CSVConnector:
    """Expose a single CSV file as a queryable table via in-memory DuckDB.

    Each public method opens a fresh ``:memory:`` DuckDB connection, registers
    the CSV as a view named after the file (see ``_get_table_name``), does its
    work and closes the connection again. No state other than ``file_path``
    is kept between calls.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path`` and verify that it exists.

        Args:
            file_path: Path to the CSV file to expose.

        Raises:
            FileNotFoundError: If nothing exists at ``file_path``.
        """
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"CSV file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-friendly table name from the file's base name.

        The extension is dropped, every non-alphanumeric character becomes
        ``_``, and a ``t_`` prefix is added when the result would otherwise
        start with a digit.
        """
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        safe_name = "".join(c if c.isalnum() else "_" for c in base)
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _csv_source_sql(self) -> str:
        """Return the ``read_csv_auto('<path>')`` SQL fragment for this file.

        Single quotes in the path are doubled so that a path containing ``'``
        cannot terminate the SQL string literal early (which would otherwise
        break the statement or allow SQL injection through the file name).
        """
        escaped_path = self.file_path.replace("'", "''")
        return f"read_csv_auto('{escaped_path}')"

    def _register_view(self, conn: "duckdb.DuckDBPyConnection", table_name: str) -> None:
        """Create (or replace) the view ``table_name`` over the CSV on ``conn``."""
        # The identifier is double-quoted so that file names which happen to be
        # SQL keywords (e.g. "select.csv") still yield a valid statement.
        # ``table_name`` only contains alphanumerics and underscores, so it can
        # never contain a double quote itself.
        conn.execute(
            f'CREATE OR REPLACE VIEW "{table_name}" AS '
            f"SELECT * FROM {self._csv_source_sql()}"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run ``query`` against the CSV view and return the rows as dicts.

        The query is expected to reference the table name reported by
        ``get_schema``.

        NOTE(review): ``query`` is executed verbatim; callers are responsible
        for making sure it comes from a trusted source.

        Args:
            query: SQL text to execute.

        Returns:
            One ``{column: value}`` dict per result row.

        Raises:
            Exception: Whatever DuckDB raises is printed and re-raised as-is.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()

        try:
            self._register_view(conn, table_name)
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        except Exception as e:
            print(f"CSV Query Error: {e}")
            # Bare ``raise`` re-raises the active exception unchanged.
            raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Describe the CSV's columns.

        Returns:
            ``{table_name: [{"name": ..., "type": ...}, ...]}`` on success, or
            an empty dict if the file could not be read (the error is printed,
            not raised).
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()

        try:
            self._register_view(conn, table_name)
            columns = conn.execute(f'DESCRIBE "{table_name}"').fetchall()
            # Each DESCRIBE row starts with (column name, column type, ...).
            return {
                table_name: [{"name": col[0], "type": col[1]} for col in columns]
            }
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True if DuckDB can read at least one row of the CSV.

        Any failure is printed and reported as ``False`` rather than raised.
        """
        conn = None
        try:
            conn = duckdb.connect(":memory:")
            conn.execute(f"SELECT * FROM {self._csv_source_sql()} LIMIT 1")
            return True
        except Exception as e:
            print(f"CSV Connection Error: {e}")
            return False
        finally:
            # Close on the failure path too; previously the connection leaked
            # whenever the probe query raised.
            if conn is not None:
                conn.close()
|