Update 2026-05-13 16:43:53

2026-05-13 16:43:53 +08:00
parent 6af5c584f4
commit afd7c5fe85
490 changed files with 850 additions and 922 deletions
@@ -0,0 +1,266 @@
+import json
+import re
+from typing import List, Dict, Any, Optional
+import time
+import sys
+from pathlib import Path
+
+# Add project root to sys.path
+PROJECT_ROOT = Path(__file__).resolve().parents[3]
+NANOBOT_ROOT = PROJECT_ROOT / "agent-core"
+if str(NANOBOT_ROOT) not in sys.path:
+    sys.path.append(str(NANOBOT_ROOT))
+
+from app.core.llm_provider import build_llm_provider
+from app.schemas.chart import ChartGenerationResponse
+from app.services.llm_cache import get_active_llm_config
+from app.trace import build_error_attributes, trace_service
+
+CHART_MAX_TOKENS = 700
+CHART_TEMPERATURE = 0.2
+CHART_REASONING_EFFORT = "low"
+
+CHART_INSTRUCTIONS = """
+### INSTRUCTIONS ###
+
+- Chart types: Bar chart, Line chart, Multi line chart, Area chart, Pie chart, Stacked bar chart, Grouped bar chart
+- You can only use the chart types provided in the instructions
+- Generated chart should answer the user's question and based on the semantics of the SQL query, and the sample data, sample column values are used to help you generate the suitable chart type
+- If the sample data is not suitable for visualization, you must return an empty string for the schema and chart type
+- If the sample data is empty, you must return an empty string for the schema and chart type
+- The language for the chart and reasoning must be the same language provided by the user
+- Please use the current time provided by the user to generate the chart
+- In order to generate the grouped bar chart, you need to follow the given instructions:
+    - Disable Stacking: Add "stack": null to the y-encoding.
+    - Use xOffset for subcategories to group bars.
+    - Don't use "transform" section.
+- In order to generate the pie chart, you need to follow the given instructions:
+    - Add {"type": "arc"} to the mark section.
+    - Add "theta" encoding to the encoding section.
+    - Add "color" encoding to the encoding section.
+    - Don't add "innerRadius" to the mark section.
+- If the x-axis of the chart is a temporal field, the time unit should be the same as the question user asked.
+    - For yearly question, the time unit should be "year".
+    - For monthly question, the time unit should be "yearmonth".
+    - For weekly question, the time unit should be "yearmonthdate".
+    - For daily question, the time unit should be "yearmonthdate".
+    - Default time unit is "yearmonth".
+- For each axis, generate the corresponding human-readable title based on the language provided by the user.
+- **CRITICAL REQUIREMENT**: Make sure all of the `field` values in the encoding section of the chart schema EXACTLY MATCH the column names of the sample data provided! DO NOT translate, rename, or hallucinate `field` names. If you want to show a translated name in the chart, use the `title` property, NOT the `field` property!
+
+### GUIDELINES TO PLOT CHART ###
+
+1. Understanding Your Data Types
+- Nominal (Categorical): Names or labels without a specific order (e.g., types of fruits, countries).
+- Ordinal: Categorical data with a meaningful order but no fixed intervals (e.g., rankings, satisfaction levels).
+- Quantitative: Numerical values representing counts or measurements (e.g., sales figures, temperatures).
+- Temporal: Date or time data (e.g., timestamps, dates).
+2. Chart Types and When to Use Them
+- Bar Chart
+    - Use When: Comparing quantities across different categories.
+    - Data Requirements:
+        - One categorical variable (x-axis).
+        - One quantitative variable (y-axis).
+    - Example: Comparing sales numbers for different product categories.
+- Grouped Bar Chart
+    - Use When: Comparing sub-categories within main categories.
+    - Data Requirements:
+        - Two categorical variables (x-axis grouped by one, color-coded by another).
+        - One quantitative variable (y-axis).
+        - Example: Sales numbers for different products across various regions.
+- Line Chart
+    - Use When: Displaying trends over continuous data, especially time.
+    - Data Requirements:
+        - One temporal or ordinal variable (x-axis).
+        - One quantitative variable (y-axis).
+    - Example: Tracking monthly revenue over a year.
+- Multi Line Chart
+    - Use When: Displaying trends over continuous data, especially time.
+    - Data Requirements:
+        - One temporal or ordinal variable (x-axis).
+        - Two or more quantitative variables (y-axis and color).
+    - Implementation Notes:
+        - Uses `transform` with `fold` to combine multiple metrics into a single series
+        - The folded metrics are distinguished using the color encoding
+    - Example: Tracking monthly click rate and read rate over a year.
+- Area Chart
+    - Use When: Similar to line charts but emphasizing the volume of change over time.
+    - Data Requirements:
+        - Same as Line Chart.
+    - Example: Visualizing cumulative rainfall over months.
+- Pie Chart
+    - Use When: Showing parts of a whole as percentages.
+    - Data Requirements:
+        - One categorical variable.
+        - One quantitative variable representing proportions.
+    - Example: Market share distribution among companies.
+- Stacked Bar Chart
+    - Use When: Showing composition and comparison across categories.
+    - Data Requirements: Same as grouped bar chart.
+    - Example: Sales by region and product type.
+"""
+
+CHART_EXAMPLES = """
+### EXAMPLES ###
+
+1. Bar Chart
+- Sample Data:
+ [
+    {"Region": "North", "Sales": 100},
+    {"Region": "South", "Sales": 200},
+    {"Region": "East", "Sales": 300},
+    {"Region": "West", "Sales": 400}
+]
+- Chart Schema:
+{
+    "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>,
+    "mark": {"type": "bar"},
+    "encoding": {
+        "x": {"field": "Region", "type": "nominal", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>},
+        "y": {"field": "Sales", "type": "quantitative", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>},
+        "color": {"field": "Region", "type": "nominal", "title": "<TITLE_IN_LANGUAGE_PROVIDED_BY_USER>"}
+    }
+}
+2. Line Chart
+- Sample Data:
+[
+    {"Date": "2022-01-01", "Sales": 100},
+    {"Date": "2022-01-02", "Sales": 200},
+    {"Date": "2022-01-03", "Sales": 300},
+    {"Date": "2022-01-04", "Sales": 400}
+]
+- Chart Schema:
+{
+    "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>,
+    "mark": {"type": "line"},
+    "encoding": {
+        "x": {"field": "Date", "type": "temporal", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>},
+        "y": {"field": "Sales", "type": "quantitative", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>}
+    }
+}
+"""
+
+async def generate_chart(data: List[Dict[str, Any]], query: str) -> ChartGenerationResponse:
+    trace_attributes = {
+        "component": "chart_generation",
+        "rows": len(data),
+    }
+    active_config = get_active_llm_config()
+    
+    if not active_config:
+        return ChartGenerationResponse(
+            reasoning="No active LLM configuration found",
+            can_visualize=False,
+            chart_type=""
+        )
+    
+    try:
+        provider = build_llm_provider(
+            model=active_config.get("model"),
+            provider=active_config.get("provider"),
+            api_key=active_config.get("api_key"),
+            api_base=active_config.get("api_base"),
+            extra_headers=active_config.get("extra_headers") or {},
+        )
+    except Exception as e:
+        return ChartGenerationResponse(
+            reasoning=f"Failed to initialize LLM provider: {e}",
+            can_visualize=False,
+            chart_type=""
+        )
+
+    # 2. Prepare Data Sample
+    if not data:
+        return ChartGenerationResponse(
+            reasoning="No data provided to visualize",
+            can_visualize=False,
+            chart_type=""
+        )
+
+    sample_size = 5
+    sample_data = data[:sample_size]
+    # Handle case where data might not be list of dicts
+    if isinstance(data[0], (list, tuple)):
+        # If it's a list of lists, we can't easily infer columns without more info.
+        # For now, assume it's list of dicts as per postgres/clickhouse connector expectation (formatted_results)
+        columns = [f"col_{i}" for i in range(len(data[0]))]
+    else:
+        columns = list(data[0].keys())
+    
+    # 3. Construct Prompt
+    schema_json = json.dumps(ChartGenerationResponse.model_json_schema(), ensure_ascii=False, separators=(",", ":"))
+    
+    system_prompt = f"""You are a data analyst great at visualizing data using vega-lite! Given the user's question, sample data and sample column values, you need to generate vega-lite schema in JSON and provide suitable chart type.
+Besides, you need to give a concise and easy-to-understand reasoning to describe why you provide such vega-lite schema based on the question, sample data and sample column values.
+
+{CHART_INSTRUCTIONS}
+
+{CHART_EXAMPLES}
+
+- If the user provides a custom instruction, it should be followed strictly and you should use it to change the style of response for reasoning.
+
+### OUTPUT FORMAT ###
+
+You must return a valid JSON object strictly matching the following JSON Schema:
+
+{schema_json}
+
+Please provide your chain of thought reasoning, chart type and the vega-lite schema in JSON format.
+"""
+    
+    user_prompt = f"""
+### INPUT ###
+Question: {query}
+Sample Data: {json.dumps(sample_data, ensure_ascii=False, separators=(",", ":"), default=str)}
+Sample Column Values: {columns}
+Language: Chinese (Simplified)
+"""
+
+    messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt}
+    ]
+
+    # 4. Call LLM
+    try:
+        with trace_service.start_span(
+            "chart.generate",
+            attributes={
+                **trace_attributes,
+                "model": active_config.get("model"),
+            },
+            input_payload={"query": query, "columns": columns},
+        ) as span:
+            response = await provider.chat(
+                messages=messages,
+                max_tokens=CHART_MAX_TOKENS,
+                temperature=CHART_TEMPERATURE,
+                reasoning_effort=CHART_REASONING_EFFORT,
+            )
+            content = response.content
+            if "```json" in content:
+                content = content.split("```json")[1].split("```")[0]
+            elif "```" in content:
+                content = content.split("```")[1].split("```")[0]
+            content = content.strip()
+            result = json.loads(content)
+            chart_result = ChartGenerationResponse(**result)
+            span.set_attributes(
+                {
+                    "chart.can_visualize": bool(chart_result.can_visualize),
+                    "chart.type": chart_result.chart_type,
+                }
+            )
+            span.update(output={"chart_type": chart_result.chart_type})
+            return chart_result
+    except Exception as e:
+        with trace_service.start_span(
+            "chart.generate.error",
+            attributes={**trace_attributes, **build_error_attributes(e, stage="chart_generation")},
+        ):
+            pass
+        return ChartGenerationResponse(
+            reasoning=f"Failed to generate chart configuration: {str(e)}",
+            can_visualize=False,
+            chart_type=""
+        )
@@ -0,0 +1,720 @@
+import asyncio
+import sys
+import os
+import json
+import time
+import threading
+import logging
+from pathlib import Path
+from typing import List, Optional, Dict, Any, Callable, Awaitable
+from pydantic import BaseModel, Field
+import duckdb
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+# Add project root to sys.path to allow importing nanobot
+PROJECT_ROOT = Path(__file__).resolve().parents[3]
+NANOBOT_ROOT = PROJECT_ROOT / "agent-core"
+if str(NANOBOT_ROOT) not in sys.path:
+    sys.path.append(str(NANOBOT_ROOT))
+
+from app.core.llm_provider import build_llm_provider
+from app.connectors.postgres import postgres_connector
+from app.connectors.clickhouse import clickhouse_connector
+from app.connectors.factory import get_connector
+from app.schemas.chart import ChartGenerationResponse
+from app.agent.chart import generate_chart
+from app.database import SessionLocal
+from app.models.datasource import DataSource
+from app.core.files import resolve_upload_file_path
+from app.services.mdl import MDLService
+from app.services.llm_cache import get_active_llm_config
+from app.trace import trace_service
+
+SCHEMA_CACHE_TTL_SECONDS = 300
+CONNECTION_CACHE_TTL_SECONDS = 30
+UPLOAD_CACHE_TTL_SECONDS = 900
+MAX_UPLOAD_CACHE_ITEMS = 8
+NL2SQL_MAX_TOKENS = 900
+NL2SQL_TEMPERATURE = 0.1
+NL2SQL_REASONING_EFFORT = "low"
+NL2SQL_LLM_TIMEOUT_SECONDS = int(os.getenv("NL2SQL_LLM_TIMEOUT_SECONDS", "90"))
+NL2SQL_LLM_REQUEST_TIMEOUT_SECONDS = int(os.getenv("NL2SQL_LLM_REQUEST_TIMEOUT_SECONDS", "45"))
+NL2SQL_LLM_RETRY_COUNT = int(os.getenv("NL2SQL_LLM_RETRY_COUNT", "0"))
+NL2SQL_SQL_EXEC_TIMEOUT_SECONDS = 60
+NL2SQL_CHART_TIMEOUT_SECONDS = int(os.getenv("NL2SQL_CHART_TIMEOUT_SECONDS", "45"))
+
+_schema_cache: Dict[str, Dict[str, Any]] = {}
+_connection_cache: Dict[str, Dict[str, Any]] = {}
+_upload_cache: Dict[str, Dict[str, Any]] = {}
+_cache_lock = threading.Lock()
+
+class NL2SQLRequest(BaseModel):
+    query: str = Field(..., description="User's natural language query")
+    source: str = Field(..., description="Data source to query (postgres, clickhouse, upload, ds:{id})")
+    file_url: Optional[str] = Field(None, description="Uploaded file URL when source is upload")
+    session_id: Optional[str] = Field(None, description="Conversation session identifier")
+    generate_chart: bool = Field(False, description="Whether to generate chart specification")
+
+class NL2SQLResponse(BaseModel):
+    sql: str
+    result: List[Dict[str, Any]]
+    error: Optional[str] = None
+    chart: Optional[ChartGenerationResponse] = None
+
+# WrenAI-inspired SQL Rules
+DEFAULT_TEXT_TO_SQL_RULES = """
+### SQL RULES ###
+- ONLY USE SELECT statements, NO DELETE, UPDATE OR INSERT etc. statements that might change the data in the database.
+- ONLY USE the tables and columns mentioned in the database schema.
+- ONLY USE "*" if the user query asks for all the columns of a table.
+- ONLY CHOOSE columns belong to the tables mentioned in the database schema.
+- DON'T INCLUDE comments in the generated SQL query.
+- YOU MUST USE "JOIN" if you choose columns from multiple tables!
+- PREFER USING CTEs over subqueries.
+- When generating SQL query, always:
+    - Put double quotes around column and table names.
+    - Put single quotes around string literals.
+    - Never quote numeric literals.
+    For example: SELECT "customers"."customer_name" FROM "customers" WHERE "customers"."city" = 'Taipei' and "customers"."year" = 1992;
+- YOU MUST USE "lower(<table_name>.<column_name>) like lower(<value>)" function or "lower(<table_name>.<column_name>) = lower(<value>)" function for case-insensitive comparison!
+    - Use "lower(<table_name>.<column_name>) LIKE lower(<value>)" when:
+        - The user requests a pattern or partial match.
+        - The value is not specific enough to be a single, exact value.
+        - Wildcards (%) are needed to capture the pattern.
+    - Use "lower(<table_name>.<column_name>) = lower(<value>)" when:
+        - The user requests an exact, specific value.
+        - There is no ambiguity or pattern in the value.
+- If the column is date/time related field, and it is a INT/BIGINT/DOUBLE/FLOAT type, please use the appropriate function mentioned in the SQL FUNCTIONS section to cast the column to "TIMESTAMP" type first before using it in the query
+- ALWAYS CAST the date/time related field to "TIMESTAMP WITH TIME ZONE" type when using them in the query
+- If the user asks for a specific date, please give the date range in SQL query
+- Aggregate functions are not allowed in the WHERE clause. Instead, they belong in the HAVING clause, which is used to filter after aggregation.
+- You can only add "ORDER BY" and "LIMIT" to the final "UNION" result.
+- For the ranking problem, you must use the ranking function, `DENSE_RANK()` to rank the results and then use `WHERE` clause to filter the results.
+- For the ranking problem, you must add the ranking column to the final SELECT clause.
+"""
+
+TABLE_SELECTOR_SYSTEM_PROMPT = """
+You are a helpful assistant that identifies relevant database tables for a given natural language query.
+
+Given the list of available tables and the user's question, return a list of table names that are likely to contain the information needed to answer the question.
+
+### FINAL ANSWER FORMAT ###
+The final answer must be a JSON array of strings:
+[
+    "table_name1",
+    "table_name2"
+]
+"""
+
+SQL_GENERATION_SYSTEM_PROMPT = """
+You are a helpful assistant that converts natural language queries into ANSI SQL queries.
+
+Given user's question and database schema, generate accurate ANSI SQL directly and concisely.
+
+### GENERAL RULES ###
+
+1. YOU MUST FOLLOW the instructions strictly to generate the SQL query if the section of USER INSTRUCTIONS is available in user's input.
+2. YOU MUST FOLLOW SQL Rules if they are not contradicted with instructions.
+
+""" + DEFAULT_TEXT_TO_SQL_RULES + """
+
+### FEW-SHOT EXAMPLES ###
+Example 1:
+User's Question: 谁是去年前五个销售额最高的客户？
+Database Schema: {"customers": [{"name": "customer_id", "type": "INT"}, {"name": "customer_name", "type": "TEXT"}], "orders": [{"name": "order_id", "type": "INT"}, {"name": "customer_id", "type": "INT"}, {"name": "amount", "type": "DECIMAL"}, {"name": "order_date", "type": "DATE"}]}
+Final Answer: 
+{
+    "reasoning": "I need to join customers and orders, filter for last year (2025 if current is 2026), group by customer, sum the amount, and limit to top 5.",
+    "sql": "SELECT \\"customers\\".\\"customer_name\\", SUM(\\"orders\\".\\"amount\\") AS \\"total_sales\\" FROM \\"customers\\" JOIN \\"orders\\" ON \\"customers\\".\\"customer_id\\" = \\"orders\\".\\"customer_id\\" WHERE \\"orders\\".\\"order_date\" BETWEEN '2025-01-01' AND '2025-12-31' GROUP BY \\"customers\\".\\"customer_name\\" ORDER BY \\"total_sales\\" DESC LIMIT 5;"
+}
+
+### FINAL ANSWER FORMAT ###
+The final answer must be a ANSI SQL query in JSON format:
+
+{
+    "reasoning": <STEP_BY_STEP_REASONING_PLAN>,
+    "sql": <SQL_QUERY_STRING>
+}
+"""
+
+def _resolve_upload_file_path(file_url: Optional[str]) -> Path:
+    try:
+        return resolve_upload_file_path(file_url)
+    except ValueError as e:
+        raise ValueError(f"Invalid uploaded file URL: {e}")
+
+def _load_upload_dataframe_from_path(file_path: Path) -> pd.DataFrame:
+    suffix = file_path.suffix.lower()
+    if suffix == ".csv":
+        return pd.read_csv(file_path)
+    if suffix in [".xls", ".xlsx"]:
+        return pd.read_excel(file_path)
+    if suffix == ".parquet":
+        return pd.read_parquet(file_path)
+    raise ValueError(f"Unsupported uploaded file type: {suffix}")
+
+def _build_upload_schema(df: pd.DataFrame) -> Dict[str, List[Dict[str, str]]]:
+    conn = duckdb.connect(":memory:")
+    conn.register("uploaded_file", df)
+    columns = conn.execute("DESCRIBE uploaded_file").fetchall()
+    schema = {"uploaded_file": [{"name": col[0], "type": col[1]} for col in columns]}
+    conn.close()
+    return schema
+
+def _get_upload_payload(file_url: Optional[str]) -> Dict[str, Any]:
+    file_path = _resolve_upload_file_path(file_url)
+    stat = file_path.stat()
+    cache_key = f"{file_path}:{int(stat.st_mtime)}:{stat.st_size}"
+    now = time.time()
+    with _cache_lock:
+        cached = _upload_cache.get(cache_key)
+        if cached and now < cached["expires_at"]:
+            return {"df": cached["df"], "schema": cached["schema"]}
+    df = _load_upload_dataframe_from_path(file_path)
+    schema = _build_upload_schema(df)
+    with _cache_lock:
+        if len(_upload_cache) >= MAX_UPLOAD_CACHE_ITEMS:
+            oldest_key = min(_upload_cache.keys(), key=lambda key: _upload_cache[key]["expires_at"])
+            _upload_cache.pop(oldest_key, None)
+        _upload_cache[cache_key] = {
+            "df": df,
+            "schema": schema,
+            "expires_at": now + UPLOAD_CACHE_TTL_SECONDS,
+        }
+    return {"df": df, "schema": schema}
+
+def _execute_upload_sql(sql_query: str, df: pd.DataFrame) -> List[Dict[str, Any]]:
+    conn = duckdb.connect(":memory:")
+    conn.register("uploaded_file", df)
+    result_df = conn.execute(sql_query).df()
+    conn.close()
+    return result_df.to_dict(orient="records")
+
+def _to_number(value: Any) -> Optional[float]:
+    if isinstance(value, bool):
+        return None
+    if isinstance(value, (int, float)):
+        return float(value)
+    if isinstance(value, str):
+        text = value.strip().replace(",", "")
+        if not text:
+            return None
+        try:
+            return float(text)
+        except ValueError:
+            return None
+    return None
+
+# _build_fallback_chart removed as per user request to not hardcode fallbacks
+
+def _build_schema_cache_key(source: str, connector: Any) -> str:
+    # If source is ds:ID, that's already a good key
+    if source.startswith("ds:"):
+        return source
+        
+    if source == "postgres":
+        return f"postgres:{getattr(connector, 'db_url', '')}"
+    if source == "clickhouse":
+        return (
+            f"clickhouse:{getattr(connector, 'host', '')}:{getattr(connector, 'port', '')}:"
+            f"{getattr(connector, 'user', '')}:{getattr(connector, 'database', '')}"
+        )
+    return source
+
+def _get_cached_schema(source: str, connector: Any) -> Optional[Dict[str, List[Dict[str, str]]]]:
+    key = _build_schema_cache_key(source, connector)
+    now = time.time()
+    with _cache_lock:
+        cached = _schema_cache.get(key)
+        if cached and now < cached["expires_at"]:
+            return cached["schema"]
+    return None
+
+def _set_cached_schema(source: str, connector: Any, schema: Dict[str, List[Dict[str, str]]]) -> None:
+    key = _build_schema_cache_key(source, connector)
+    with _cache_lock:
+        _schema_cache[key] = {"schema": schema, "expires_at": time.time() + SCHEMA_CACHE_TTL_SECONDS}
+
+async def _check_connection_with_cache(source: str, connector: Any) -> bool:
+    cache_key = _build_schema_cache_key(source, connector)
+    now = time.time()
+    with _cache_lock:
+        cached = _connection_cache.get(cache_key)
+        if cached and now < cached["expires_at"]:
+            return bool(cached["ok"])
+    
+    # Run synchronous test_connection in a separate thread to avoid blocking event loop
+    try:
+        ok = await asyncio.wait_for(
+            asyncio.to_thread(connector.test_connection),
+            timeout=15.0
+        )
+    except asyncio.TimeoutError:
+        print("Connection test failed or timed out: Timeout after 15 seconds")
+        ok = False
+    except Exception as e:
+        print(f"Connection test failed or timed out: {e}")
+        ok = False
+    
+    with _cache_lock:
+        _connection_cache[cache_key] = {"ok": ok, "expires_at": now + CONNECTION_CACHE_TTL_SECONDS}
+    return ok
+
+async def _select_relevant_tables(
+    query: str, 
+    schema: Dict[str, Any], 
+    provider: Any, 
+    on_progress: Callable[[str], Awaitable[None]] | None = None
+) -> List[str]:
+    """Use LLM to select relevant tables from the schema to reduce context size."""
+    table_names = list(schema.keys())
+    if len(table_names) <= 5:
+        return table_names
+
+    if on_progress:
+        await on_progress("正在进行语义表搜索")
+
+    user_prompt = f"User's Question: {query}\nAvailable Tables: {', '.join(table_names)}"
+    messages = [
+        {"role": "system", "content": TABLE_SELECTOR_SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt}
+    ]
+
+    try:
+        response = await asyncio.wait_for(
+            provider.chat(
+                messages=messages,
+                max_tokens=200,
+                temperature=0.0,
+            ),
+            timeout=30.0
+        )
+        content = (response.content or "").strip()
+        if "```json" in content:
+            content = content.split("```json")[1].split("```")[0]
+        elif "```" in content:
+            content = content.split("```")[1].split("```")[0]
+        
+        selected = json.loads(content.strip())
+        if isinstance(selected, list):
+            # Filter valid table names
+            valid_selected = [t for t in selected if t in table_names]
+            if valid_selected:
+                return valid_selected
+    except Exception as e:
+        logger.warning(f"Table selection failed: {e}")
+    
+    return table_names
+
+async def _fetch_sample_data(
+    connector: Any, 
+    table_names: List[str], 
+    on_progress: Callable[[str], Awaitable[None]] | None = None
+) -> Dict[str, List[Dict[str, Any]]]:
+    """Fetch sample rows for selected tables to help LLM understand data."""
+    samples = {}
+    if not connector or not hasattr(connector, "execute_query"):
+        return samples
+
+    if on_progress:
+        await on_progress(f"正在抓取 {len(table_names)} 張表的樣本數據")
+
+    for table in table_names:
+        try:
+            # We use a very small limit
+            query = f"SELECT * FROM \"{table}\" LIMIT 3"
+            results = await asyncio.wait_for(
+                asyncio.to_thread(connector.execute_query, query),
+                timeout=10.0
+            )
+            
+            rows = []
+            if isinstance(results, list):
+                if results and isinstance(results[0], dict):
+                    rows = results
+                elif results and isinstance(results[0], (list, tuple)):
+                    rows = [list(row) for row in results]
+                else:
+                    rows = results
+            elif isinstance(results, tuple) and len(results) == 2:
+                rows_raw, cols = results
+                col_names = [c[0] for c in cols]
+                rows = [dict(zip(col_names, row)) for row in rows_raw]
+            
+            if rows:
+                samples[table] = rows
+        except Exception as e:
+            logger.warning(f"Failed to fetch sample for {table}: {e}")
+            
+    return samples
+
+async def process_nl2sql(
+    request: NL2SQLRequest,
+    on_progress: Callable[[str], Awaitable[None]] | None = None,
+) -> NL2SQLResponse:
+    async def emit_progress(content: str) -> None:
+        if on_progress and content:
+            await on_progress(content)
+
+    total_started = time.perf_counter()
+    trace_base_attributes = {
+        "component": "nl2sql",
+        "source": request.source,
+        "session_id": request.session_id,
+        "generate_chart": request.generate_chart,
+    }
+    # 1. Get the connector and schema
+    connector = None
+    schema = {}
+    upload_df: Optional[pd.DataFrame] = None
+    
+    if request.source == "postgres":
+        connector = postgres_connector
+    elif request.source == "clickhouse":
+        connector = clickhouse_connector
+    elif request.source == "upload":
+        try:
+            upload_started = time.perf_counter()
+            upload_payload = await asyncio.to_thread(_get_upload_payload, request.file_url)
+            upload_df = upload_payload["df"]
+            schema = upload_payload["schema"]
+            await emit_progress(f"上传文件加载完成 ({time.perf_counter() - upload_started:.2f}s)")
+        except Exception as e:
+            return NL2SQLResponse(sql="", result=[], error=f"Failed to load uploaded file: {e}")
+    elif request.source.startswith("ds:"):
+        try:
+            ds_started = time.perf_counter()
+            ds_id = int(request.source.split(":")[1])
+            
+            def _get_ds_connector():
+                db = SessionLocal()
+                try:
+                    ds = db.query(DataSource).filter(DataSource.id == ds_id).first()
+                    if not ds:
+                        return None
+                    return get_connector(ds)
+                finally:
+                    db.close()
+            
+            connector = await asyncio.to_thread(_get_ds_connector)
+            if not connector:
+                return NL2SQLResponse(sql="", result=[], error=f"Data source not found: {request.source}")
+                
+            await emit_progress(f"数据源配置读取完成 ({time.perf_counter() - ds_started:.2f}s)")
+        except ValueError:
+             return NL2SQLResponse(sql="", result=[], error=f"Invalid data source ID: {request.source}")
+        except Exception as e:
+             return NL2SQLResponse(sql="", result=[], error=f"Failed to load data source: {e}")
+    else:
+        return NL2SQLResponse(sql="", result=[], error=f"Unsupported data source: {request.source}")
+
+    if connector:
+        await emit_progress("正在检测数据源连通性")
+        cached_schema = _get_cached_schema(request.source, connector)
+        if cached_schema is not None:
+            schema = cached_schema
+            await emit_progress(f"命中 Schema 缓存，已加载 {len(schema)} 张表")
+        else:
+            conn_started = time.perf_counter()
+            if not await _check_connection_with_cache(request.source, connector):
+                return NL2SQLResponse(sql="", result=[], error=f"Failed to connect to {request.source}")
+            await emit_progress(f"连接检测完成 ({time.perf_counter() - conn_started:.2f}s)")
+            schema_started = time.perf_counter()
+            try:
+                schema = await asyncio.wait_for(
+                    asyncio.to_thread(connector.get_schema),
+                    timeout=120.0
+                )
+            except asyncio.TimeoutError:
+                return NL2SQLResponse(sql="", result=[], error="Failed to fetch schema: Timeout after 120 seconds. Data source might be too large or network is slow.")
+            except Exception as e:
+                return NL2SQLResponse(sql="", result=[], error=f"Failed to fetch schema: {e}")
+            
+            _set_cached_schema(request.source, connector, schema)
+            await emit_progress(f"Schema 拉取完成，共 {len(schema)} 张表 ({time.perf_counter() - schema_started:.2f}s)")
+
+    # 2. Get the active LLM config
+    active_config = get_active_llm_config()
+    
+    if not active_config:
+        return NL2SQLResponse(sql="", result=[], error="No active LLM configuration found")
+
+    # 3. Initialize Provider
+    try:
+        provider = build_llm_provider(
+            model=active_config.get("model"),
+            provider=active_config.get("provider"),
+            api_key=active_config.get("api_key"),
+            api_base=active_config.get("api_base"),
+            extra_headers=active_config.get("extra_headers") or {},
+        )
+    except Exception as e:
+        return NL2SQLResponse(sql="", result=[], error=f"Failed to initialize LLM provider: {e}")
+
+    # 4. Table Selection and Sample Data (Optimization)
+    relevant_tables = await _select_relevant_tables(request.query, schema, provider, emit_progress)
+    pruned_schema = {t: schema[t] for t in relevant_tables if t in schema}
+    
+    samples = {}
+    if request.source != "upload": # For upload, df is already in memory and small
+        samples = await _fetch_sample_data(connector, relevant_tables, emit_progress)
+    
+    schema_str = json.dumps(pruned_schema, ensure_ascii=False, separators=(",", ":"))
+    samples_str = json.dumps(samples, ensure_ascii=False, separators=(",", ":")) if samples else ""
+
+    # Try to load MDL context
+    mdl_context = ""
+    if request.source.startswith("ds:"):
+        try:
+            ds_id = int(request.source.split(":")[1])
+            mdl = await asyncio.to_thread(MDLService.get_mdl, ds_id)
+            if mdl:
+                mdl_lines = ["\n### SEMANTIC MODEL (WrenMDL) ###"]
+                
+                mdl_lines.append("MODELS:")
+                for model in mdl.models:
+                    # Only include relevant models
+                    if model.name not in relevant_tables and (model.tableReference and model.tableReference.table not in relevant_tables):
+                        continue
+
+                    table_ref = model.tableReference.table if model.tableReference else model.name
+                    desc = f" - Description: {model.properties.get('description', '')}" if model.properties.get('description') else ""
+                    mdl_lines.append(f"- Model: {model.name} (Table: {table_ref}){desc}")
+                    
+                    if model.columns:
+                        mdl_lines.append("  Columns:")
+                        for col in model.columns:
+                            col_desc = f" ({col.properties.get('description')})" if col.properties.get('description') else ""
+                            expr = f" [Calculated: {col.expression}]" if col.isCalculated else ""
+                            mdl_lines.append(f"    - {col.name} ({col.type}){col_desc}{expr}")
+                
+                if mdl.relationships:
+                    mdl_lines.append("\nRELATIONSHIPS:")
+                    for rel in mdl.relationships:
+                        # Only include relevant relationships
+                        rel_models = rel.models if isinstance(rel.models, list) else []
+                        if any(m in relevant_tables for m in rel_models):
+                            mdl_lines.append(f"- {rel.name}: {rel.joinType} between {rel.models} ON {rel.condition}")
+                
+                mdl_context = "\n".join(mdl_lines)
+        except Exception as e:
+            print(f"Failed to load MDL: {e}")
+
+    # 5. Construct Prompt
+    user_prompt = f"""
+### DATABASE SCHEMA ###
+{schema_str}
+{mdl_context}
+
+### SAMPLE DATA ###
+{samples_str}
+
+### INPUTS ###
+User's Question: {request.query}
+Language: Chinese (Simplified)
+"""
+
+    messages = [
+        {"role": "system", "content": SQL_GENERATION_SYSTEM_PROMPT},
+        {"role": "user", "content": user_prompt}
+    ]
+
+    # 5. Call LLM & 6. Execute SQL (with Self-Correction Loop)
+    MAX_SQL_EXEC_RETRIES = int(os.getenv("NL2SQL_MAX_EXEC_RETRIES", "2"))
+    sql_query = ""
+    formatted_results = []
+    chart_response = None
+    timeout_stage = "llm_generation"
+
+    for exec_attempt in range(MAX_SQL_EXEC_RETRIES + 1):
+        try:
+            llm_started = time.perf_counter()
+            if exec_attempt == 0:
+                await emit_progress("正在生成 SQL")
+            else:
+                await emit_progress(f"正在尝试修复 SQL ({exec_attempt}/{MAX_SQL_EXEC_RETRIES})")
+                
+            response = None
+            last_error = ""
+
+            for attempt in range(NL2SQL_LLM_RETRY_COUNT + 1):
+                try:
+                    with trace_service.start_span(
+                        "nl2sql.llm_generation",
+                        attributes={
+                            **trace_base_attributes,
+                            "exec_attempt": exec_attempt,
+                            "retry_attempt": attempt,
+                            "model": active_config.get("model"),
+                        },
+                    ) as llm_span:
+                        response = await asyncio.wait_for(
+                            provider.chat(
+                                messages=messages,
+                                max_tokens=NL2SQL_MAX_TOKENS,
+                                temperature=NL2SQL_TEMPERATURE,
+                                reasoning_effort=NL2SQL_REASONING_EFFORT,
+                            ),
+                            timeout=NL2SQL_LLM_TIMEOUT_SECONDS,
+                        )
+                        llm_span.update(output={"finish_reason": getattr(response, "finish_reason", None)})
+                except asyncio.TimeoutError:
+                    last_error = f"LLM generation timeout after {NL2SQL_LLM_TIMEOUT_SECONDS}s"
+                    if attempt < NL2SQL_LLM_RETRY_COUNT:
+                        await emit_progress(f"SQL 生成超时，正在重试 ({attempt + 1}/{NL2SQL_LLM_RETRY_COUNT})")
+                        continue
+                    return NL2SQLResponse(sql=sql_query, result=[], error=last_error)
+                except Exception as e:
+                    last_error = f"LLM generation failed: {e}"
+                    if attempt < NL2SQL_LLM_RETRY_COUNT:
+                        await emit_progress(f"SQL 生成失败，正在重试 ({attempt + 1}/{NL2SQL_LLM_RETRY_COUNT})")
+                        continue
+                    return NL2SQLResponse(sql=sql_query, result=[], error=last_error)
+
+                if response.finish_reason == "error":
+                    last_error = response.content or "LLM Error"
+                    if attempt < NL2SQL_LLM_RETRY_COUNT:
+                        await emit_progress(f"模型返回错误，正在重试 ({attempt + 1}/{NL2SQL_LLM_RETRY_COUNT})")
+                        continue
+                    return NL2SQLResponse(sql=sql_query, result=[], error=last_error)
+                break
+
+            if response is None:
+                return NL2SQLResponse(sql=sql_query, result=[], error=last_error or "LLM generation failed")
+
+            content = (response.content or "").strip()
+            if not content:
+                return NL2SQLResponse(sql=sql_query, result=[], error="LLM returned empty response")
+
+            # Clean up code blocks
+            if "```json" in content:
+                content = content.split("```json")[1].split("```")[0]
+            elif "```" in content:
+                content = content.split("```")[1].split("```")[0]
+
+            content = content.strip()
+
+            try:
+                result_json = json.loads(content)
+                sql_query = result_json.get("sql", "").strip()
+            except json.JSONDecodeError:
+                # Fallback if LLM doesn't return valid JSON despite instructions
+                sql_query = content
+                
+            logger.info(f"Generated SQL for query '{request.query}':\n{sql_query}")
+            
+            # 格式化单行 SQL 用于在前端进度中展示
+            formatted_sql = sql_query.replace('\n', ' ')
+            if len(formatted_sql) > 150:
+                formatted_sql = formatted_sql[:147] + "..."
+            await emit_progress(f"SQL 生成完成: {formatted_sql}")
+
+        except Exception as e:
+            return NL2SQLResponse(sql=sql_query, result=[], error=f"LLM generation failed: {e}")
+
+        # 6. Execute SQL
+        try:
+            timeout_stage = "sql_execution"
+            sql_exec_started = time.perf_counter()
+            await emit_progress("正在执行 SQL 查询")
+            with trace_service.start_span(
+                "nl2sql.sql_execution",
+                attributes={
+                    **trace_base_attributes,
+                    "exec_attempt": exec_attempt,
+                },
+                input_payload={"sql": sql_query},
+            ) as sql_span:
+                if request.source == "upload":
+                    if upload_df is None:
+                        upload_payload = await asyncio.to_thread(_get_upload_payload, request.file_url)
+                        upload_df = upload_payload["df"]
+                    formatted_results = await asyncio.wait_for(
+                        asyncio.to_thread(_execute_upload_sql, sql_query, upload_df),
+                        timeout=NL2SQL_SQL_EXEC_TIMEOUT_SECONDS,
+                    )
+                else:
+                    results = await asyncio.wait_for(
+                        asyncio.to_thread(connector.execute_query, sql_query),
+                        timeout=NL2SQL_SQL_EXEC_TIMEOUT_SECONDS,
+                    )
+                    formatted_results = []
+                    if isinstance(results, list):
+                        if results and isinstance(results[0], dict):
+                            formatted_results = results
+                        elif results and isinstance(results[0], (list, tuple)):
+                            formatted_results = [list(row) for row in results]
+                        else:
+                            formatted_results = results
+                    elif isinstance(results, tuple) and len(results) == 2:
+                        rows, cols = results
+                        col_names = [c[0] for c in cols]
+                        formatted_results = [dict(zip(col_names, row)) for row in rows]
+                    else:
+                        formatted_results = []
+                sql_span.set_attributes({"result_rows": len(formatted_results)})
+                     
+            await emit_progress(f"SQL 执行完成，返回 {len(formatted_results)} 行 ({time.perf_counter() - sql_exec_started:.2f}s)")
+            break # Execution succeeded, break the retry loop
+
+        except asyncio.TimeoutError:
+            return NL2SQLResponse(sql=sql_query, result=[], error=f"SQL execution timeout after {NL2SQL_SQL_EXEC_TIMEOUT_SECONDS}s")
+        except Exception as e:
+            if exec_attempt < MAX_SQL_EXEC_RETRIES:
+                await emit_progress(f"SQL 执行失败，准备自动修复 ({exec_attempt + 1}/{MAX_SQL_EXEC_RETRIES})")
+                messages.append({"role": "assistant", "content": f"```json\n{{\"sql\": \"{sql_query}\"}}\n```"})
+                messages.append({
+                    "role": "user", 
+                    "content": f"The generated SQL failed to execute. Database error:\n{str(e)}\n\nPlease fix the SQL query to resolve this error and provide the corrected version following the exact same JSON format."
+                })
+                continue
+            else:
+                return NL2SQLResponse(sql=sql_query, result=[], error=f"SQL execution failed after {MAX_SQL_EXEC_RETRIES} retries: {e}")
+
+    # 7. Generate Chart
+    if request.generate_chart and formatted_results:
+        try:
+            chart_started = time.perf_counter()
+            await emit_progress("正在生成可视化方案")
+            timeout_stage = "chart_generation"
+            with trace_service.start_span(
+                "nl2sql.chart_generation",
+                attributes=trace_base_attributes,
+                input_payload={"query": request.query, "rows": len(formatted_results)},
+            ) as chart_span:
+                chart_response = await asyncio.wait_for(
+                    generate_chart(formatted_results, request.query),
+                    timeout=NL2SQL_CHART_TIMEOUT_SECONDS,
+                )
+                chart_span.set_attributes(
+                    {
+                        "chart.can_visualize": bool(getattr(chart_response, "can_visualize", False)),
+                        "chart.type": getattr(chart_response, "chart_type", ""),
+                    }
+                )
+            await emit_progress(f"可视化方案生成完成 ({time.perf_counter() - chart_started:.2f}s)")
+        except asyncio.TimeoutError:
+            fallback_chart = ChartGenerationResponse(
+                reasoning=f"Chart generation timeout after {NL2SQL_CHART_TIMEOUT_SECONDS}s",
+                chart_type="",
+                can_visualize=False,
+                chart_spec=None,
+            )
+            return NL2SQLResponse(sql=sql_query, result=formatted_results, chart=fallback_chart)
+        except Exception as e:
+            pass # Ignore chart generation errors, return data only
+
+    with trace_service.start_span(
+        "nl2sql.completed",
+        attributes={
+            **trace_base_attributes,
+            "total_seconds": round(time.perf_counter() - total_started, 4),
+            "result_rows": len(formatted_results),
+            "has_chart": bool(chart_response),
+        },
+    ):
+        pass
+    await emit_progress(f"NL2SQL 总耗时 {time.perf_counter() - total_started:.2f}s")
+    return NL2SQLResponse(sql=sql_query, result=formatted_results, chart=chart_response)