Update 2026-05-13 16:43:53
This commit is contained in:
@@ -0,0 +1,266 @@
|
||||
import json
|
||||
import re
|
||||
from typing import List, Dict, Any, Optional
|
||||
import time
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Add project root to sys.path
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[3]
|
||||
NANOBOT_ROOT = PROJECT_ROOT / "agent-core"
|
||||
if str(NANOBOT_ROOT) not in sys.path:
|
||||
sys.path.append(str(NANOBOT_ROOT))
|
||||
|
||||
from app.core.llm_provider import build_llm_provider
|
||||
from app.schemas.chart import ChartGenerationResponse
|
||||
from app.services.llm_cache import get_active_llm_config
|
||||
from app.trace import build_error_attributes, trace_service
|
||||
|
||||
# Generation settings forwarded to provider.chat() by generate_chart().
CHART_MAX_TOKENS = 700  # passed as max_tokens
CHART_TEMPERATURE = 0.2  # passed as temperature
CHART_REASONING_EFFORT = "low"  # passed as reasoning_effort
|
||||
|
||||
CHART_INSTRUCTIONS = """
|
||||
### INSTRUCTIONS ###
|
||||
|
||||
- Chart types: Bar chart, Line chart, Multi line chart, Area chart, Pie chart, Stacked bar chart, Grouped bar chart
|
||||
- You can only use the chart types provided in the instructions
|
||||
- Generated chart should answer the user's question and based on the semantics of the SQL query, and the sample data, sample column values are used to help you generate the suitable chart type
|
||||
- If the sample data is not suitable for visualization, you must return an empty string for the schema and chart type
|
||||
- If the sample data is empty, you must return an empty string for the schema and chart type
|
||||
- The language for the chart and reasoning must be the same language provided by the user
|
||||
- Please use the current time provided by the user to generate the chart
|
||||
- In order to generate the grouped bar chart, you need to follow the given instructions:
|
||||
- Disable Stacking: Add "stack": null to the y-encoding.
|
||||
- Use xOffset for subcategories to group bars.
|
||||
- Don't use "transform" section.
|
||||
- In order to generate the pie chart, you need to follow the given instructions:
|
||||
- Add {"type": "arc"} to the mark section.
|
||||
- Add "theta" encoding to the encoding section.
|
||||
- Add "color" encoding to the encoding section.
|
||||
- Don't add "innerRadius" to the mark section.
|
||||
- If the x-axis of the chart is a temporal field, the time unit should be the same as the question user asked.
|
||||
- For yearly question, the time unit should be "year".
|
||||
- For monthly question, the time unit should be "yearmonth".
|
||||
- For weekly question, the time unit should be "yearmonthdate".
|
||||
- For daily question, the time unit should be "yearmonthdate".
|
||||
- Default time unit is "yearmonth".
|
||||
- For each axis, generate the corresponding human-readable title based on the language provided by the user.
|
||||
- **CRITICAL REQUIREMENT**: Make sure all of the `field` values in the encoding section of the chart schema EXACTLY MATCH the column names of the sample data provided! DO NOT translate, rename, or hallucinate `field` names. If you want to show a translated name in the chart, use the `title` property, NOT the `field` property!
|
||||
|
||||
### GUIDELINES TO PLOT CHART ###
|
||||
|
||||
1. Understanding Your Data Types
|
||||
- Nominal (Categorical): Names or labels without a specific order (e.g., types of fruits, countries).
|
||||
- Ordinal: Categorical data with a meaningful order but no fixed intervals (e.g., rankings, satisfaction levels).
|
||||
- Quantitative: Numerical values representing counts or measurements (e.g., sales figures, temperatures).
|
||||
- Temporal: Date or time data (e.g., timestamps, dates).
|
||||
2. Chart Types and When to Use Them
|
||||
- Bar Chart
|
||||
- Use When: Comparing quantities across different categories.
|
||||
- Data Requirements:
|
||||
- One categorical variable (x-axis).
|
||||
- One quantitative variable (y-axis).
|
||||
- Example: Comparing sales numbers for different product categories.
|
||||
- Grouped Bar Chart
|
||||
- Use When: Comparing sub-categories within main categories.
|
||||
- Data Requirements:
|
||||
- Two categorical variables (x-axis grouped by one, color-coded by another).
|
||||
- One quantitative variable (y-axis).
|
||||
- Example: Sales numbers for different products across various regions.
|
||||
- Line Chart
|
||||
- Use When: Displaying trends over continuous data, especially time.
|
||||
- Data Requirements:
|
||||
- One temporal or ordinal variable (x-axis).
|
||||
- One quantitative variable (y-axis).
|
||||
- Example: Tracking monthly revenue over a year.
|
||||
- Multi Line Chart
|
||||
- Use When: Displaying trends over continuous data, especially time.
|
||||
- Data Requirements:
|
||||
- One temporal or ordinal variable (x-axis).
|
||||
- Two or more quantitative variables (y-axis and color).
|
||||
- Implementation Notes:
|
||||
- Uses `transform` with `fold` to combine multiple metrics into a single series
|
||||
- The folded metrics are distinguished using the color encoding
|
||||
- Example: Tracking monthly click rate and read rate over a year.
|
||||
- Area Chart
|
||||
- Use When: Similar to line charts but emphasizing the volume of change over time.
|
||||
- Data Requirements:
|
||||
- Same as Line Chart.
|
||||
- Example: Visualizing cumulative rainfall over months.
|
||||
- Pie Chart
|
||||
- Use When: Showing parts of a whole as percentages.
|
||||
- Data Requirements:
|
||||
- One categorical variable.
|
||||
- One quantitative variable representing proportions.
|
||||
- Example: Market share distribution among companies.
|
||||
- Stacked Bar Chart
|
||||
- Use When: Showing composition and comparison across categories.
|
||||
- Data Requirements: Same as grouped bar chart.
|
||||
- Example: Sales by region and product type.
|
||||
"""
|
||||
|
||||
CHART_EXAMPLES = """
|
||||
### EXAMPLES ###
|
||||
|
||||
1. Bar Chart
|
||||
- Sample Data:
|
||||
[
|
||||
{"Region": "North", "Sales": 100},
|
||||
{"Region": "South", "Sales": 200},
|
||||
{"Region": "East", "Sales": 300},
|
||||
{"Region": "West", "Sales": 400}
|
||||
]
|
||||
- Chart Schema:
|
||||
{
|
||||
"title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>,
|
||||
"mark": {"type": "bar"},
|
||||
"encoding": {
|
||||
"x": {"field": "Region", "type": "nominal", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>},
|
||||
"y": {"field": "Sales", "type": "quantitative", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>},
|
||||
"color": {"field": "Region", "type": "nominal", "title": "<TITLE_IN_LANGUAGE_PROVIDED_BY_USER>"}
|
||||
}
|
||||
}
|
||||
2. Line Chart
|
||||
- Sample Data:
|
||||
[
|
||||
{"Date": "2022-01-01", "Sales": 100},
|
||||
{"Date": "2022-01-02", "Sales": 200},
|
||||
{"Date": "2022-01-03", "Sales": 300},
|
||||
{"Date": "2022-01-04", "Sales": 400}
|
||||
]
|
||||
- Chart Schema:
|
||||
{
|
||||
"title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>,
|
||||
"mark": {"type": "line"},
|
||||
"encoding": {
|
||||
"x": {"field": "Date", "type": "temporal", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>},
|
||||
"y": {"field": "Sales", "type": "quantitative", "title": <TITLE_IN_LANGUAGE_PROVIDED_BY_USER>}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
async def generate_chart(data: List[Dict[str, Any]], query: str) -> ChartGenerationResponse:
    """Ask the active LLM for a vega-lite chart spec that visualizes ``data``.

    Only the first five rows are sent to the model as sample data. The prompt
    always requests Chinese (Simplified) output.

    Args:
        data: Query result rows. Rows are expected to be dicts; list/tuple
            rows are tolerated and get synthetic ``col_<i>`` column names.
        query: The user's natural-language question.

    Returns:
        The parsed ``ChartGenerationResponse``. Every failure path (no LLM
        config, provider construction error, empty data, LLM or parsing
        error) returns a response with ``can_visualize=False``,
        ``chart_type=""`` and the cause in ``reasoning``; this function does
        not raise for those cases.
    """
    trace_attributes = {
        "component": "chart_generation",
        # Tolerate a missing result set; the empty-data guard below reports it.
        "rows": len(data) if data else 0,
    }
    active_config = get_active_llm_config()

    if not active_config:
        return ChartGenerationResponse(
            reasoning="No active LLM configuration found",
            can_visualize=False,
            chart_type="",
        )

    # 1. Build the LLM provider from the active configuration
    try:
        provider = build_llm_provider(
            model=active_config.get("model"),
            provider=active_config.get("provider"),
            api_key=active_config.get("api_key"),
            api_base=active_config.get("api_base"),
            extra_headers=active_config.get("extra_headers") or {},
        )
    except Exception as e:
        return ChartGenerationResponse(
            reasoning=f"Failed to initialize LLM provider: {e}",
            can_visualize=False,
            chart_type="",
        )

    # 2. Prepare Data Sample
    if not data:
        return ChartGenerationResponse(
            reasoning="No data provided to visualize",
            can_visualize=False,
            chart_type="",
        )

    sample_size = 5
    sample_data = data[:sample_size]
    # Handle case where data might not be list of dicts
    if isinstance(data[0], (list, tuple)):
        # If it's a list of lists, we can't easily infer columns without more info.
        # For now, assume it's list of dicts as per postgres/clickhouse connector expectation (formatted_results)
        columns = [f"col_{i}" for i in range(len(data[0]))]
    else:
        columns = list(data[0].keys())

    # 3. Construct Prompt
    schema_json = json.dumps(ChartGenerationResponse.model_json_schema(), ensure_ascii=False, separators=(",", ":"))

    system_prompt = f"""You are a data analyst great at visualizing data using vega-lite! Given the user's question, sample data and sample column values, you need to generate vega-lite schema in JSON and provide suitable chart type.
Besides, you need to give a concise and easy-to-understand reasoning to describe why you provide such vega-lite schema based on the question, sample data and sample column values.

{CHART_INSTRUCTIONS}

{CHART_EXAMPLES}

- If the user provides a custom instruction, it should be followed strictly and you should use it to change the style of response for reasoning.

### OUTPUT FORMAT ###

You must return a valid JSON object strictly matching the following JSON Schema:

{schema_json}

Please provide your chain of thought reasoning, chart type and the vega-lite schema in JSON format.
"""

    user_prompt = f"""
### INPUT ###
Question: {query}
Sample Data: {json.dumps(sample_data, ensure_ascii=False, separators=(",", ":"), default=str)}
Sample Column Values: {columns}
Language: Chinese (Simplified)
"""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # 4. Call LLM
    try:
        with trace_service.start_span(
            "chart.generate",
            attributes={
                **trace_attributes,
                "model": active_config.get("model"),
            },
            input_payload={"query": query, "columns": columns},
        ) as span:
            response = await provider.chat(
                messages=messages,
                max_tokens=CHART_MAX_TOKENS,
                temperature=CHART_TEMPERATURE,
                reasoning_effort=CHART_REASONING_EFFORT,
            )
            # The provider may return no content at all; normalise to "" so
            # the failure is reported clearly instead of as a TypeError.
            content = response.content or ""
            # Strip a Markdown code fence if the model wrapped its JSON in one.
            if "```json" in content:
                content = content.split("```json")[1].split("```")[0]
            elif "```" in content:
                content = content.split("```")[1].split("```")[0]
            content = content.strip()
            if not content:
                raise ValueError("LLM returned an empty response")
            result = json.loads(content)
            chart_result = ChartGenerationResponse(**result)
            span.set_attributes(
                {
                    "chart.can_visualize": bool(chart_result.can_visualize),
                    "chart.type": chart_result.chart_type,
                }
            )
            span.update(output={"chart_type": chart_result.chart_type})
            return chart_result
    except Exception as e:
        # Record the failure as its own span; the span body is intentionally empty.
        with trace_service.start_span(
            "chart.generate.error",
            attributes={**trace_attributes, **build_error_attributes(e, stage="chart_generation")},
        ):
            pass
        return ChartGenerationResponse(
            reasoning=f"Failed to generate chart configuration: {str(e)}",
            can_visualize=False,
            chart_type="",
        )
|
||||
@@ -0,0 +1,720 @@
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import threading
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any, Callable, Awaitable
|
||||
from pydantic import BaseModel, Field
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Add project root to sys.path to allow importing nanobot
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[3]
|
||||
NANOBOT_ROOT = PROJECT_ROOT / "agent-core"
|
||||
if str(NANOBOT_ROOT) not in sys.path:
|
||||
sys.path.append(str(NANOBOT_ROOT))
|
||||
|
||||
from app.core.llm_provider import build_llm_provider
|
||||
from app.connectors.postgres import postgres_connector
|
||||
from app.connectors.clickhouse import clickhouse_connector
|
||||
from app.connectors.factory import get_connector
|
||||
from app.schemas.chart import ChartGenerationResponse
|
||||
from app.agent.chart import generate_chart
|
||||
from app.database import SessionLocal
|
||||
from app.models.datasource import DataSource
|
||||
from app.core.files import resolve_upload_file_path
|
||||
from app.services.mdl import MDLService
|
||||
from app.services.llm_cache import get_active_llm_config
|
||||
from app.trace import trace_service
|
||||
|
||||
# Lifetimes (in seconds) and size bound for the in-process caches below.
SCHEMA_CACHE_TTL_SECONDS = 300  # entries written by _set_cached_schema()
CONNECTION_CACHE_TTL_SECONDS = 30  # entries written by _check_connection_with_cache()
UPLOAD_CACHE_TTL_SECONDS = 900  # entries written by _get_upload_payload()
MAX_UPLOAD_CACHE_ITEMS = 8  # _get_upload_payload() evicts one entry once this many are held
# NL2SQL tuning values. Those read through os.getenv() can be overridden by an
# environment variable of the same name; a non-integer value makes int() raise
# at import time.
# NOTE(review): the consumers of these values are not visible in this part of
# the file -- confirm the units/usage against process_nl2sql.
NL2SQL_MAX_TOKENS = 900
NL2SQL_TEMPERATURE = 0.1
NL2SQL_REASONING_EFFORT = "low"
NL2SQL_LLM_TIMEOUT_SECONDS = int(os.getenv("NL2SQL_LLM_TIMEOUT_SECONDS", "90"))
NL2SQL_LLM_REQUEST_TIMEOUT_SECONDS = int(os.getenv("NL2SQL_LLM_REQUEST_TIMEOUT_SECONDS", "45"))
NL2SQL_LLM_RETRY_COUNT = int(os.getenv("NL2SQL_LLM_RETRY_COUNT", "0"))
NL2SQL_SQL_EXEC_TIMEOUT_SECONDS = 60
NL2SQL_CHART_TIMEOUT_SECONDS = int(os.getenv("NL2SQL_CHART_TIMEOUT_SECONDS", "45"))

# Module-level caches. Each value is a dict carrying an "expires_at" epoch
# timestamp next to the cached payload.
_schema_cache: Dict[str, Dict[str, Any]] = {}  # key -> {"schema", "expires_at"}
_connection_cache: Dict[str, Dict[str, Any]] = {}  # key -> {"ok", "expires_at"}
_upload_cache: Dict[str, Dict[str, Any]] = {}  # key -> {"df", "schema", "expires_at"}
_cache_lock = threading.Lock()  # single lock guarding all three caches above
|
||||
|
||||
class NL2SQLRequest(BaseModel):
    """Input payload for process_nl2sql()."""

    # The natural-language question to translate into SQL.
    query: str = Field(..., description="User's natural language query")
    # process_nl2sql() accepts "postgres", "clickhouse", "upload" or
    # "ds:<integer id>"; any other value yields an "Unsupported data source" error.
    source: str = Field(..., description="Data source to query (postgres, clickhouse, upload, ds:{id})")
    # Resolved to a local file by _get_upload_payload() when source == "upload".
    file_url: Optional[str] = Field(None, description="Uploaded file URL when source is upload")
    # Recorded in the trace attributes of the run.
    session_id: Optional[str] = Field(None, description="Conversation session identifier")
    # NOTE: shares its name with the imported generate_chart() function, but
    # only as a model attribute.
    generate_chart: bool = Field(False, description="Whether to generate chart specification")
|
||||
|
||||
class NL2SQLResponse(BaseModel):
    """Result of process_nl2sql().

    The failure paths visible in process_nl2sql() return ``sql=""`` and
    ``result=[]`` with ``error`` set to a human-readable message.
    """

    # Generated SQL text ("" on the early failure paths).
    sql: str
    # Result rows, one dict per row.
    result: List[Dict[str, Any]]
    # Error description, or None when no error was reported.
    error: Optional[str] = None
    # Optional chart specification attached to the response.
    chart: Optional[ChartGenerationResponse] = None
|
||||
|
||||
# WrenAI-inspired SQL Rules
# Concatenated into SQL_GENERATION_SYSTEM_PROMPT; this is runtime text sent to
# the LLM.
DEFAULT_TEXT_TO_SQL_RULES = """
### SQL RULES ###
- ONLY USE SELECT statements, NO DELETE, UPDATE OR INSERT etc. statements that might change the data in the database.
- ONLY USE the tables and columns mentioned in the database schema.
- ONLY USE "*" if the user query asks for all the columns of a table.
- ONLY CHOOSE columns belong to the tables mentioned in the database schema.
- DON'T INCLUDE comments in the generated SQL query.
- YOU MUST USE "JOIN" if you choose columns from multiple tables!
- PREFER USING CTEs over subqueries.
- When generating SQL query, always:
- Put double quotes around column and table names.
- Put single quotes around string literals.
- Never quote numeric literals.
For example: SELECT "customers"."customer_name" FROM "customers" WHERE "customers"."city" = 'Taipei' and "customers"."year" = 1992;
- YOU MUST USE "lower(<table_name>.<column_name>) like lower(<value>)" function or "lower(<table_name>.<column_name>) = lower(<value>)" function for case-insensitive comparison!
- Use "lower(<table_name>.<column_name>) LIKE lower(<value>)" when:
- The user requests a pattern or partial match.
- The value is not specific enough to be a single, exact value.
- Wildcards (%) are needed to capture the pattern.
- Use "lower(<table_name>.<column_name>) = lower(<value>)" when:
- The user requests an exact, specific value.
- There is no ambiguity or pattern in the value.
- If the column is date/time related field, and it is a INT/BIGINT/DOUBLE/FLOAT type, please use the appropriate function mentioned in the SQL FUNCTIONS section to cast the column to "TIMESTAMP" type first before using it in the query
- ALWAYS CAST the date/time related field to "TIMESTAMP WITH TIME ZONE" type when using them in the query
- If the user asks for a specific date, please give the date range in SQL query
- Aggregate functions are not allowed in the WHERE clause. Instead, they belong in the HAVING clause, which is used to filter after aggregation.
- You can only add "ORDER BY" and "LIMIT" to the final "UNION" result.
- For the ranking problem, you must use the ranking function, `DENSE_RANK()` to rank the results and then use `WHERE` clause to filter the results.
- For the ranking problem, you must add the ranking column to the final SELECT clause.
"""
|
||||
|
||||
TABLE_SELECTOR_SYSTEM_PROMPT = """
|
||||
You are a helpful assistant that identifies relevant database tables for a given natural language query.
|
||||
|
||||
Given the list of available tables and the user's question, return a list of table names that are likely to contain the information needed to answer the question.
|
||||
|
||||
### FINAL ANSWER FORMAT ###
|
||||
The final answer must be a JSON array of strings:
|
||||
[
|
||||
"table_name1",
|
||||
"table_name2"
|
||||
]
|
||||
"""
|
||||
|
||||
# System prompt for SQL generation: general rules, the shared SQL rules, one
# few-shot example and the required JSON answer format.
# In this (non-raw) literal, \\" renders as \" in the prompt, i.e. an escaped
# quote inside the example's JSON string. Every identifier quote in the example
# "sql" value must use that form so the example is itself valid JSON.
SQL_GENERATION_SYSTEM_PROMPT = """
You are a helpful assistant that converts natural language queries into ANSI SQL queries.

Given user's question and database schema, generate accurate ANSI SQL directly and concisely.

### GENERAL RULES ###

1. YOU MUST FOLLOW the instructions strictly to generate the SQL query if the section of USER INSTRUCTIONS is available in user's input.
2. YOU MUST FOLLOW SQL Rules if they are not contradicted with instructions.

""" + DEFAULT_TEXT_TO_SQL_RULES + """

### FEW-SHOT EXAMPLES ###
Example 1:
User's Question: 谁是去年前五个销售额最高的客户?
Database Schema: {"customers": [{"name": "customer_id", "type": "INT"}, {"name": "customer_name", "type": "TEXT"}], "orders": [{"name": "order_id", "type": "INT"}, {"name": "customer_id", "type": "INT"}, {"name": "amount", "type": "DECIMAL"}, {"name": "order_date", "type": "DATE"}]}
Final Answer:
{
"reasoning": "I need to join customers and orders, filter for last year (2025 if current is 2026), group by customer, sum the amount, and limit to top 5.",
"sql": "SELECT \\"customers\\".\\"customer_name\\", SUM(\\"orders\\".\\"amount\\") AS \\"total_sales\\" FROM \\"customers\\" JOIN \\"orders\\" ON \\"customers\\".\\"customer_id\\" = \\"orders\\".\\"customer_id\\" WHERE \\"orders\\".\\"order_date\\" BETWEEN '2025-01-01' AND '2025-12-31' GROUP BY \\"customers\\".\\"customer_name\\" ORDER BY \\"total_sales\\" DESC LIMIT 5;"
}

### FINAL ANSWER FORMAT ###
The final answer must be a ANSI SQL query in JSON format:

{
"reasoning": <STEP_BY_STEP_REASONING_PLAN>,
"sql": <SQL_QUERY_STRING>
}
"""
|
||||
|
||||
def _resolve_upload_file_path(file_url: Optional[str]) -> Path:
    """Resolve an uploaded-file URL to a path via ``resolve_upload_file_path``.

    Args:
        file_url: URL of the uploaded file, as supplied in the request.

    Returns:
        The path returned by ``resolve_upload_file_path``.

    Raises:
        ValueError: If the underlying resolver rejects the URL. The message is
            prefixed with "Invalid uploaded file URL" and the original error
            is kept as ``__cause__``.
    """
    try:
        return resolve_upload_file_path(file_url)
    except ValueError as e:
        # Chain explicitly so the resolver's traceback is preserved.
        raise ValueError(f"Invalid uploaded file URL: {e}") from e
|
||||
|
||||
def _load_upload_dataframe_from_path(file_path: Path) -> pd.DataFrame:
|
||||
suffix = file_path.suffix.lower()
|
||||
if suffix == ".csv":
|
||||
return pd.read_csv(file_path)
|
||||
if suffix in [".xls", ".xlsx"]:
|
||||
return pd.read_excel(file_path)
|
||||
if suffix == ".parquet":
|
||||
return pd.read_parquet(file_path)
|
||||
raise ValueError(f"Unsupported uploaded file type: {suffix}")
|
||||
|
||||
def _build_upload_schema(df: pd.DataFrame) -> Dict[str, List[Dict[str, str]]]:
    """Describe ``df`` as a one-table schema named ``uploaded_file``.

    The frame is registered in a throwaway in-memory DuckDB connection and
    ``DESCRIBE`` supplies the column names and types.

    Returns:
        ``{"uploaded_file": [{"name": <column>, "type": <type>}, ...]}``.
    """
    conn = duckdb.connect(":memory:")
    try:
        conn.register("uploaded_file", df)
        columns = conn.execute("DESCRIBE uploaded_file").fetchall()
    finally:
        # Always release the connection, even when register/DESCRIBE fails.
        conn.close()
    # The first two fields of each DESCRIBE row are used as name and type.
    return {"uploaded_file": [{"name": col[0], "type": col[1]} for col in columns]}
|
||||
|
||||
def _get_upload_payload(file_url: Optional[str]) -> Dict[str, Any]:
    """Load an uploaded file and its schema, served from a small TTL cache.

    The cache key combines the resolved path, the modification time in
    nanoseconds and the file size, so a rewritten file gets a fresh entry.

    Args:
        file_url: URL of the uploaded file.

    Returns:
        ``{"df": <DataFrame>, "schema": <schema dict>}``. The DataFrame object
        is shared with the cache, so callers should treat it as read-only.

    Raises:
        ValueError: If the URL is invalid or the file type is unsupported.
        OSError: If the file cannot be stat'ed (raised by ``Path.stat``).
    """
    file_path = _resolve_upload_file_path(file_url)
    stat = file_path.stat()
    # st_mtime_ns rather than int(st_mtime): whole-second truncation would
    # serve stale data for a same-size rewrite within the same second.
    cache_key = f"{file_path}:{stat.st_mtime_ns}:{stat.st_size}"
    now = time.time()
    with _cache_lock:
        cached = _upload_cache.get(cache_key)
        if cached and now < cached["expires_at"]:
            return {"df": cached["df"], "schema": cached["schema"]}

    # Load outside the lock so slow file parsing does not block other callers.
    df = _load_upload_dataframe_from_path(file_path)
    schema = _build_upload_schema(df)

    with _cache_lock:
        stored_at = time.time()
        # Drop expired entries first so they do not occupy slots indefinitely.
        expired_keys = [key for key, entry in _upload_cache.items() if entry["expires_at"] <= stored_at]
        for key in expired_keys:
            _upload_cache.pop(key, None)
        # Only evict when adding a new key would exceed the bound.
        if cache_key not in _upload_cache and _upload_cache and len(_upload_cache) >= MAX_UPLOAD_CACHE_ITEMS:
            oldest_key = min(_upload_cache, key=lambda key: _upload_cache[key]["expires_at"])
            _upload_cache.pop(oldest_key, None)
        _upload_cache[cache_key] = {
            "df": df,
            "schema": schema,
            "expires_at": stored_at + UPLOAD_CACHE_TTL_SECONDS,
        }
    return {"df": df, "schema": schema}
|
||||
|
||||
def _execute_upload_sql(sql_query: str, df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Run ``sql_query`` against ``df``, exposed as table ``uploaded_file``.

    A fresh in-memory DuckDB connection is used for each call.

    Args:
        sql_query: SQL text to execute.
        df: The uploaded data to query.

    Returns:
        The result rows as a list of ``{column: value}`` dicts.
    """
    # NOTE(review): the SQL is executed as given. If it originates from an LLM
    # or a user, confirm that DuckDB's file-reading functions are acceptable
    # in this deployment.
    conn = duckdb.connect(":memory:")
    try:
        conn.register("uploaded_file", df)
        result_df = conn.execute(sql_query).df()
    finally:
        # Close the connection even when the query is invalid.
        conn.close()
    return result_df.to_dict(orient="records")
|
||||
|
||||
def _to_number(value: Any) -> Optional[float]:
|
||||
if isinstance(value, bool):
|
||||
return None
|
||||
if isinstance(value, (int, float)):
|
||||
return float(value)
|
||||
if isinstance(value, str):
|
||||
text = value.strip().replace(",", "")
|
||||
if not text:
|
||||
return None
|
||||
try:
|
||||
return float(text)
|
||||
except ValueError:
|
||||
return None
|
||||
return None
|
||||
|
||||
# _build_fallback_chart removed as per user request to not hardcode fallbacks
|
||||
|
||||
def _build_schema_cache_key(source: str, connector: Any) -> str:
|
||||
# If source is ds:ID, that's already a good key
|
||||
if source.startswith("ds:"):
|
||||
return source
|
||||
|
||||
if source == "postgres":
|
||||
return f"postgres:{getattr(connector, 'db_url', '')}"
|
||||
if source == "clickhouse":
|
||||
return (
|
||||
f"clickhouse:{getattr(connector, 'host', '')}:{getattr(connector, 'port', '')}:"
|
||||
f"{getattr(connector, 'user', '')}:{getattr(connector, 'database', '')}"
|
||||
)
|
||||
return source
|
||||
|
||||
def _get_cached_schema(source: str, connector: Any) -> Optional[Dict[str, List[Dict[str, str]]]]:
    """Return the cached schema for this source, or ``None`` if absent or expired."""
    cache_key = _build_schema_cache_key(source, connector)
    checked_at = time.time()
    with _cache_lock:
        entry = _schema_cache.get(cache_key)
        if not entry or checked_at >= entry["expires_at"]:
            return None
        return entry["schema"]
|
||||
|
||||
def _set_cached_schema(source: str, connector: Any, schema: Dict[str, List[Dict[str, str]]]) -> None:
    """Store ``schema`` for this source, valid for ``SCHEMA_CACHE_TTL_SECONDS``."""
    cache_key = _build_schema_cache_key(source, connector)
    with _cache_lock:
        expires_at = time.time() + SCHEMA_CACHE_TTL_SECONDS
        _schema_cache[cache_key] = {"schema": schema, "expires_at": expires_at}
|
||||
|
||||
async def _check_connection_with_cache(source: str, connector: Any) -> bool:
    """Test connectivity to a data source, caching the outcome briefly.

    A cached result (success or failure) is reused until it expires.
    Otherwise ``connector.test_connection`` runs in a worker thread with a
    15-second timeout; a timeout or any exception counts as a failed test.

    Returns:
        ``True`` if the connection test succeeded, ``False`` otherwise.
    """
    cache_key = _build_schema_cache_key(source, connector)
    with _cache_lock:
        cached = _connection_cache.get(cache_key)
        if cached and time.time() < cached["expires_at"]:
            return bool(cached["ok"])

    # Run synchronous test_connection in a separate thread to avoid blocking event loop
    try:
        ok = bool(
            await asyncio.wait_for(
                asyncio.to_thread(connector.test_connection),
                timeout=15.0,
            )
        )
    except asyncio.TimeoutError:
        logger.warning("Connection test failed or timed out: Timeout after 15 seconds")
        ok = False
    except Exception as e:
        logger.warning("Connection test failed or timed out: %s", e)
        ok = False

    with _cache_lock:
        # Start the TTL when the test finishes, not when it began: the test
        # itself may take up to 15 seconds of a 30-second lifetime.
        _connection_cache[cache_key] = {
            "ok": ok,
            "expires_at": time.time() + CONNECTION_CACHE_TTL_SECONDS,
        }
    return ok
|
||||
|
||||
async def _select_relevant_tables(
|
||||
query: str,
|
||||
schema: Dict[str, Any],
|
||||
provider: Any,
|
||||
on_progress: Callable[[str], Awaitable[None]] | None = None
|
||||
) -> List[str]:
|
||||
"""Use LLM to select relevant tables from the schema to reduce context size."""
|
||||
table_names = list(schema.keys())
|
||||
if len(table_names) <= 5:
|
||||
return table_names
|
||||
|
||||
if on_progress:
|
||||
await on_progress("正在进行语义表搜索")
|
||||
|
||||
user_prompt = f"User's Question: {query}\nAvailable Tables: {', '.join(table_names)}"
|
||||
messages = [
|
||||
{"role": "system", "content": TABLE_SELECTOR_SYSTEM_PROMPT},
|
||||
{"role": "user", "content": user_prompt}
|
||||
]
|
||||
|
||||
try:
|
||||
response = await asyncio.wait_for(
|
||||
provider.chat(
|
||||
messages=messages,
|
||||
max_tokens=200,
|
||||
temperature=0.0,
|
||||
),
|
||||
timeout=30.0
|
||||
)
|
||||
content = (response.content or "").strip()
|
||||
if "```json" in content:
|
||||
content = content.split("```json")[1].split("```")[0]
|
||||
elif "```" in content:
|
||||
content = content.split("```")[1].split("```")[0]
|
||||
|
||||
selected = json.loads(content.strip())
|
||||
if isinstance(selected, list):
|
||||
# Filter valid table names
|
||||
valid_selected = [t for t in selected if t in table_names]
|
||||
if valid_selected:
|
||||
return valid_selected
|
||||
except Exception as e:
|
||||
logger.warning(f"Table selection failed: {e}")
|
||||
|
||||
return table_names
|
||||
|
||||
async def _fetch_sample_data(
|
||||
connector: Any,
|
||||
table_names: List[str],
|
||||
on_progress: Callable[[str], Awaitable[None]] | None = None
|
||||
) -> Dict[str, List[Dict[str, Any]]]:
|
||||
"""Fetch sample rows for selected tables to help LLM understand data."""
|
||||
samples = {}
|
||||
if not connector or not hasattr(connector, "execute_query"):
|
||||
return samples
|
||||
|
||||
if on_progress:
|
||||
await on_progress(f"正在抓取 {len(table_names)} 張表的樣本數據")
|
||||
|
||||
for table in table_names:
|
||||
try:
|
||||
# We use a very small limit
|
||||
query = f"SELECT * FROM \"{table}\" LIMIT 3"
|
||||
results = await asyncio.wait_for(
|
||||
asyncio.to_thread(connector.execute_query, query),
|
||||
timeout=10.0
|
||||
)
|
||||
|
||||
rows = []
|
||||
if isinstance(results, list):
|
||||
if results and isinstance(results[0], dict):
|
||||
rows = results
|
||||
elif results and isinstance(results[0], (list, tuple)):
|
||||
rows = [list(row) for row in results]
|
||||
else:
|
||||
rows = results
|
||||
elif isinstance(results, tuple) and len(results) == 2:
|
||||
rows_raw, cols = results
|
||||
col_names = [c[0] for c in cols]
|
||||
rows = [dict(zip(col_names, row)) for row in rows_raw]
|
||||
|
||||
if rows:
|
||||
samples[table] = rows
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch sample for {table}: {e}")
|
||||
|
||||
return samples
|
||||
|
||||
def _nl2sql_rows_from_results(results):
    """Normalize a connector's raw query result into a list of rows."""
    if isinstance(results, list):
        if results and isinstance(results[0], (list, tuple)):
            return [list(row) for row in results]
        # Lists of dicts (and empty / other lists) are passed through as-is.
        return results
    if isinstance(results, tuple) and len(results) == 2:
        # (rows, column descriptors) pair: the column name is the first item
        # of each descriptor.
        rows, cols = results
        col_names = [c[0] for c in cols]
        return [dict(zip(col_names, row)) for row in rows]
    return []


def _nl2sql_extract_sql(content):
    """Extract the SQL text from an LLM reply (fenced JSON, bare JSON or raw SQL)."""
    if "```json" in content:
        content = content.split("```json")[1].split("```")[0]
    elif "```" in content:
        content = content.split("```")[1].split("```")[0]
    content = content.strip()

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError:
        # Fallback if the LLM doesn't return valid JSON despite instructions.
        return content
    if isinstance(parsed, dict):
        # ``or ""`` guards against an explicit ``"sql": null``.
        return str(parsed.get("sql") or "").strip()
    # Valid JSON but not an object: treat the reply itself as the SQL text.
    return content


def _nl2sql_build_mdl_context(mdl, relevant_tables):
    """Render the models/relationships of an MDL that touch the relevant tables."""
    mdl_lines = ["\n### SEMANTIC MODEL (WrenMDL) ###", "MODELS:"]
    for model in mdl.models:
        table_ref = model.tableReference.table if model.tableReference else model.name
        # Only include relevant models. Models without a table reference are
        # matched by their own name.
        if model.name not in relevant_tables and table_ref not in relevant_tables:
            continue

        desc = f" - Description: {model.properties.get('description', '')}" if model.properties.get('description') else ""
        mdl_lines.append(f"- Model: {model.name} (Table: {table_ref}){desc}")

        if model.columns:
            mdl_lines.append("  Columns:")
            for col in model.columns:
                col_desc = f" ({col.properties.get('description')})" if col.properties.get('description') else ""
                expr = f" [Calculated: {col.expression}]" if col.isCalculated else ""
                mdl_lines.append(f"    - {col.name} ({col.type}){col_desc}{expr}")

    if mdl.relationships:
        mdl_lines.append("\nRELATIONSHIPS:")
        for rel in mdl.relationships:
            # Only include relationships that touch a relevant table.
            rel_models = rel.models if isinstance(rel.models, list) else []
            if any(m in relevant_tables for m in rel_models):
                mdl_lines.append(f"- {rel.name}: {rel.joinType} between {rel.models} ON {rel.condition}")

    return "\n".join(mdl_lines)


async def process_nl2sql(
    request: NL2SQLRequest,
    on_progress: Callable[[str], Awaitable[None]] | None = None,
) -> NL2SQLResponse:
    """Answer a natural-language question by generating and executing SQL.

    Pipeline: resolve the data source and its schema, select the relevant
    tables, prompt the active LLM for SQL, execute it (feeding execution
    errors back to the model for up to ``NL2SQL_MAX_EXEC_RETRIES`` repair
    rounds) and optionally generate a chart for the result rows.

    Args:
        request: The NL2SQL request (query, source, session id, chart flag,
            and ``file_url`` for uploads).
        on_progress: Optional async callback that receives progress messages.

    Returns:
        An ``NL2SQLResponse``. Failures handled in this function are reported
        through its ``error`` field instead of being raised.
    """

    async def emit_progress(content: str) -> None:
        if on_progress and content:
            await on_progress(content)

    def fail(message: str, sql: str = "") -> NL2SQLResponse:
        return NL2SQLResponse(sql=sql, result=[], error=message)

    total_started = time.perf_counter()
    trace_base_attributes = {
        "component": "nl2sql",
        "source": request.source,
        "session_id": request.session_id,
        "generate_chart": request.generate_chart,
    }

    # 1. Get the connector and schema
    connector = None
    schema = {}
    upload_df: Optional[pd.DataFrame] = None
    ds_id = None

    if request.source == "postgres":
        connector = postgres_connector
    elif request.source == "clickhouse":
        connector = clickhouse_connector
    elif request.source == "upload":
        try:
            upload_started = time.perf_counter()
            upload_payload = await asyncio.to_thread(_get_upload_payload, request.file_url)
            upload_df = upload_payload["df"]
            schema = upload_payload["schema"]
            await emit_progress(f"上传文件加载完成 ({time.perf_counter() - upload_started:.2f}s)")
        except Exception as e:
            return fail(f"Failed to load uploaded file: {e}")
    elif request.source.startswith("ds:"):
        ds_started = time.perf_counter()
        # Parse the id on its own so a ValueError raised while building the
        # connector is not misreported as an invalid id.
        try:
            ds_id = int(request.source.split(":")[1])
        except ValueError:
            return fail(f"Invalid data source ID: {request.source}")

        def _get_ds_connector():
            db = SessionLocal()
            try:
                ds = db.query(DataSource).filter(DataSource.id == ds_id).first()
                if not ds:
                    return None
                return get_connector(ds)
            finally:
                db.close()

        try:
            connector = await asyncio.to_thread(_get_ds_connector)
            if not connector:
                return fail(f"Data source not found: {request.source}")
            await emit_progress(f"数据源配置读取完成 ({time.perf_counter() - ds_started:.2f}s)")
        except Exception as e:
            return fail(f"Failed to load data source: {e}")
    else:
        return fail(f"Unsupported data source: {request.source}")

    if connector:
        await emit_progress("正在检测数据源连通性")
        cached_schema = _get_cached_schema(request.source, connector)
        if cached_schema is not None:
            schema = cached_schema
            await emit_progress(f"命中 Schema 缓存,已加载 {len(schema)} 张表")
        else:
            conn_started = time.perf_counter()
            if not await _check_connection_with_cache(request.source, connector):
                return fail(f"Failed to connect to {request.source}")
            await emit_progress(f"连接检测完成 ({time.perf_counter() - conn_started:.2f}s)")
            schema_started = time.perf_counter()
            try:
                schema = await asyncio.wait_for(
                    asyncio.to_thread(connector.get_schema),
                    timeout=120.0,
                )
            except asyncio.TimeoutError:
                return fail("Failed to fetch schema: Timeout after 120 seconds. Data source might be too large or network is slow.")
            except Exception as e:
                return fail(f"Failed to fetch schema: {e}")

            _set_cached_schema(request.source, connector, schema)
            await emit_progress(f"Schema 拉取完成,共 {len(schema)} 张表 ({time.perf_counter() - schema_started:.2f}s)")

    # 2. Get the active LLM config
    active_config = get_active_llm_config()
    if not active_config:
        return fail("No active LLM configuration found")

    # 3. Initialize Provider
    try:
        provider = build_llm_provider(
            model=active_config.get("model"),
            provider=active_config.get("provider"),
            api_key=active_config.get("api_key"),
            api_base=active_config.get("api_base"),
            extra_headers=active_config.get("extra_headers") or {},
        )
    except Exception as e:
        return fail(f"Failed to initialize LLM provider: {e}")

    # 4. Table Selection and Sample Data (Optimization)
    relevant_tables = await _select_relevant_tables(request.query, schema, provider, emit_progress)
    pruned_schema = {t: schema[t] for t in relevant_tables if t in schema}

    samples = {}
    if request.source != "upload":  # For upload, df is already in memory and small
        samples = await _fetch_sample_data(connector, relevant_tables, emit_progress)

    schema_str = json.dumps(pruned_schema, ensure_ascii=False, separators=(",", ":"))
    samples_str = json.dumps(samples, ensure_ascii=False, separators=(",", ":")) if samples else ""

    # Try to load MDL context (best effort: a failure only loses the extra context).
    mdl_context = ""
    if ds_id is not None:
        try:
            mdl = await asyncio.to_thread(MDLService.get_mdl, ds_id)
            if mdl:
                mdl_context = _nl2sql_build_mdl_context(mdl, relevant_tables)
        except Exception as e:
            logger.warning(f"Failed to load MDL: {e}")

    # 5. Construct Prompt
    user_prompt = f"""
### DATABASE SCHEMA ###
{schema_str}
{mdl_context}

### SAMPLE DATA ###
{samples_str}

### INPUTS ###
User's Question: {request.query}
Language: Chinese (Simplified)
"""

    messages = [
        {"role": "system", "content": SQL_GENERATION_SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]

    # 5. Call LLM & 6. Execute SQL (with Self-Correction Loop)
    # Clamp at 0 so a negative env value still performs one attempt.
    max_exec_retries = max(0, int(os.getenv("NL2SQL_MAX_EXEC_RETRIES", "2")))
    sql_query = ""
    formatted_results = []
    chart_response = None

    for exec_attempt in range(max_exec_retries + 1):
        try:
            if exec_attempt == 0:
                await emit_progress("正在生成 SQL")
            else:
                await emit_progress(f"正在尝试修复 SQL ({exec_attempt}/{max_exec_retries})")

            response = None
            last_error = ""

            for attempt in range(NL2SQL_LLM_RETRY_COUNT + 1):
                can_retry = attempt < NL2SQL_LLM_RETRY_COUNT
                try:
                    with trace_service.start_span(
                        "nl2sql.llm_generation",
                        attributes={
                            **trace_base_attributes,
                            "exec_attempt": exec_attempt,
                            "retry_attempt": attempt,
                            "model": active_config.get("model"),
                        },
                    ) as llm_span:
                        response = await asyncio.wait_for(
                            provider.chat(
                                messages=messages,
                                max_tokens=NL2SQL_MAX_TOKENS,
                                temperature=NL2SQL_TEMPERATURE,
                                reasoning_effort=NL2SQL_REASONING_EFFORT,
                            ),
                            timeout=NL2SQL_LLM_TIMEOUT_SECONDS,
                        )
                        llm_span.update(output={"finish_reason": getattr(response, "finish_reason", None)})
                except asyncio.TimeoutError:
                    last_error = f"LLM generation timeout after {NL2SQL_LLM_TIMEOUT_SECONDS}s"
                    if can_retry:
                        await emit_progress(f"SQL 生成超时,正在重试 ({attempt + 1}/{NL2SQL_LLM_RETRY_COUNT})")
                        continue
                    return fail(last_error, sql_query)
                except Exception as e:
                    last_error = f"LLM generation failed: {e}"
                    if can_retry:
                        await emit_progress(f"SQL 生成失败,正在重试 ({attempt + 1}/{NL2SQL_LLM_RETRY_COUNT})")
                        continue
                    return fail(last_error, sql_query)

                if response.finish_reason == "error":
                    last_error = response.content or "LLM Error"
                    if can_retry:
                        await emit_progress(f"模型返回错误,正在重试 ({attempt + 1}/{NL2SQL_LLM_RETRY_COUNT})")
                        continue
                    return fail(last_error, sql_query)
                break

            if response is None:
                return fail(last_error or "LLM generation failed", sql_query)

            content = (response.content or "").strip()
            if not content:
                return fail("LLM returned empty response", sql_query)

            sql_query = _nl2sql_extract_sql(content)
            logger.info(f"Generated SQL for query '{request.query}':\n{sql_query}")

            # Collapse the SQL to a single line for the frontend progress display.
            formatted_sql = sql_query.replace('\n', ' ')
            if len(formatted_sql) > 150:
                formatted_sql = formatted_sql[:147] + "..."
            await emit_progress(f"SQL 生成完成: {formatted_sql}")

        except Exception as e:
            return fail(f"LLM generation failed: {e}", sql_query)

        # 6. Execute SQL
        try:
            sql_exec_started = time.perf_counter()
            await emit_progress("正在执行 SQL 查询")
            with trace_service.start_span(
                "nl2sql.sql_execution",
                attributes={
                    **trace_base_attributes,
                    "exec_attempt": exec_attempt,
                },
                input_payload={"sql": sql_query},
            ) as sql_span:
                if request.source == "upload":
                    if upload_df is None:
                        upload_payload = await asyncio.to_thread(_get_upload_payload, request.file_url)
                        upload_df = upload_payload["df"]
                    formatted_results = await asyncio.wait_for(
                        asyncio.to_thread(_execute_upload_sql, sql_query, upload_df),
                        timeout=NL2SQL_SQL_EXEC_TIMEOUT_SECONDS,
                    )
                else:
                    results = await asyncio.wait_for(
                        asyncio.to_thread(connector.execute_query, sql_query),
                        timeout=NL2SQL_SQL_EXEC_TIMEOUT_SECONDS,
                    )
                    formatted_results = _nl2sql_rows_from_results(results)
                sql_span.set_attributes({"result_rows": len(formatted_results)})

            await emit_progress(f"SQL 执行完成,返回 {len(formatted_results)} 行 ({time.perf_counter() - sql_exec_started:.2f}s)")
            break  # Execution succeeded, break the retry loop

        except asyncio.TimeoutError:
            return fail(f"SQL execution timeout after {NL2SQL_SQL_EXEC_TIMEOUT_SECONDS}s", sql_query)
        except Exception as e:
            if exec_attempt >= max_exec_retries:
                return fail(f"SQL execution failed after {max_exec_retries} retries: {e}", sql_query)

            await emit_progress(f"SQL 执行失败,准备自动修复 ({exec_attempt + 1}/{max_exec_retries})")
            # Serialize with json.dumps so quotes, backslashes and newlines in
            # the SQL still yield a valid JSON assistant turn.
            failed_answer = json.dumps({"sql": sql_query}, ensure_ascii=False)
            messages.append({"role": "assistant", "content": f"```json\n{failed_answer}\n```"})
            messages.append({
                "role": "user",
                "content": f"The generated SQL failed to execute. Database error:\n{str(e)}\n\nPlease fix the SQL query to resolve this error and provide the corrected version following the exact same JSON format."
            })
            continue

    # 7. Generate Chart
    if request.generate_chart and formatted_results:
        try:
            chart_started = time.perf_counter()
            await emit_progress("正在生成可视化方案")
            with trace_service.start_span(
                "nl2sql.chart_generation",
                attributes=trace_base_attributes,
                input_payload={"query": request.query, "rows": len(formatted_results)},
            ) as chart_span:
                chart_response = await asyncio.wait_for(
                    generate_chart(formatted_results, request.query),
                    timeout=NL2SQL_CHART_TIMEOUT_SECONDS,
                )
                chart_span.set_attributes(
                    {
                        "chart.can_visualize": bool(getattr(chart_response, "can_visualize", False)),
                        "chart.type": getattr(chart_response, "chart_type", ""),
                    }
                )
            await emit_progress(f"可视化方案生成完成 ({time.perf_counter() - chart_started:.2f}s)")
        except asyncio.TimeoutError:
            # Fall through with a placeholder chart so the completion span and
            # total-time progress message are still emitted.
            chart_response = ChartGenerationResponse(
                reasoning=f"Chart generation timeout after {NL2SQL_CHART_TIMEOUT_SECONDS}s",
                chart_type="",
                can_visualize=False,
                chart_spec=None,
            )
        except Exception as e:
            # Chart generation is best effort: log it and return data only.
            logger.warning(f"Chart generation failed: {e}")

    with trace_service.start_span(
        "nl2sql.completed",
        attributes={
            **trace_base_attributes,
            "total_seconds": round(time.perf_counter() - total_started, 4),
            "result_rows": len(formatted_results),
            "has_chart": bool(chart_response),
        },
    ):
        pass
    await emit_progress(f"NL2SQL 总耗时 {time.perf_counter() - total_started:.2f}s")
    return NL2SQLResponse(sql=sql_query, result=formatted_results, chart=chart_response)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,139 @@
|
||||
from typing import List, Dict, Any, Optional
|
||||
from fastapi import APIRouter, HTTPException, Depends, status
|
||||
from sqlalchemy.orm import Session
|
||||
from app.database import get_db
|
||||
from app.models.datasource import DataSource
|
||||
from app.schemas.datasource import DataSourceCreate, DataSourceUpdate, DataSource as DataSourceSchema, DataSourceTestRequest
|
||||
from app.core.security import get_current_user, get_admin_user, CurrentUser
|
||||
from app.connectors.factory import get_connector_from_config
|
||||
from pydantic import BaseModel
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/datasources", response_model=List[DataSourceSchema])
def list_datasources(
    project_id: Optional[int] = None,
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """List data sources, optionally filtered by project.

    When ``project_id`` is given, non-admin callers must own that project
    (403 otherwise). Non-admin callers receive configs with the
    ``password`` / ``api_key`` / ``secret`` keys removed; admins receive
    the ORM rows unchanged.
    """
    hidden_keys = ("password", "api_key", "secret")
    is_admin = current_user.is_admin

    query = db.query(DataSource)
    if project_id:
        query = query.filter(DataSource.project_id == project_id)
        if not is_admin:
            # Non-admins may only list data sources of projects they own.
            from app.models.project import Project
            project = db.query(Project).filter(Project.id == project_id).first()
            if not project or project.owner_id != current_user.id:
                raise HTTPException(status_code=403, detail="Not enough permissions for this project")

    datasources = query.offset(skip).limit(limit).all()
    if is_admin:
        return datasources

    def _without_secrets(ds):
        payload = DataSourceSchema.from_orm(ds).dict()
        config = payload.get("config")
        if config:
            payload["config"] = {k: v for k, v in config.items() if k not in hidden_keys}
        return payload

    return [_without_secrets(ds) for ds in datasources]
|
||||
|
||||
@router.post("/datasources", response_model=DataSourceSchema)
def create_datasource(
    datasource: DataSourceCreate,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """Create a data source under an existing project.

    Raises 404 when the project does not exist and 403 when a non-admin
    caller does not own it.
    """
    from app.models.project import Project

    owning_project = db.query(Project).filter(Project.id == datasource.project_id).first()
    if not owning_project:
        raise HTTPException(status_code=404, detail="Project not found")

    caller_owns_project = owning_project.owner_id == current_user.id
    if not (current_user.is_admin or caller_owns_project):
        raise HTTPException(status_code=403, detail="Not enough permissions for this project")

    record = DataSource(**datasource.dict())
    db.add(record)
    db.commit()
    db.refresh(record)
    return record
|
||||
|
||||
@router.get("/datasources/{datasource_id}", response_model=DataSourceSchema)
def read_datasource(
    datasource_id: int,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """Fetch one data source by id (404 if missing).

    Admins get the ORM row; other callers get a dict whose config has the
    ``password`` / ``api_key`` / ``secret`` keys removed.
    """
    record = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Data source not found")

    if current_user.is_admin:
        return record

    payload = DataSourceSchema.from_orm(record).dict()
    config = payload.get("config")
    if config:
        payload["config"] = {
            key: value
            for key, value in config.items()
            if key not in ("password", "api_key", "secret")
        }
    return payload
|
||||
|
||||
@router.put("/datasources/{datasource_id}", response_model=DataSourceSchema)
def update_datasource(
    datasource_id: int,
    datasource: DataSourceUpdate,
    db: Session = Depends(get_db),
    _: CurrentUser = Depends(get_admin_user)
):
    """Apply the explicitly-set fields of the payload to a data source (admin only)."""
    record = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if record is None:
        raise HTTPException(status_code=404, detail="Data source not found")

    # exclude_unset keeps fields the client did not send untouched.
    changes = datasource.dict(exclude_unset=True)
    for field_name in changes:
        setattr(record, field_name, changes[field_name])

    db.commit()
    db.refresh(record)
    return record
|
||||
|
||||
@router.delete("/datasources/{datasource_id}")
def delete_datasource(
    datasource_id: int,
    db: Session = Depends(get_db),
    _: CurrentUser = Depends(get_admin_user)
):
    """Delete a data source by id (admin only); 404 when it does not exist."""
    record = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if record is not None:
        db.delete(record)
        db.commit()
        return {"ok": True}
    raise HTTPException(status_code=404, detail="Data source not found")
|
||||
|
||||
@router.post("/datasources/test")
def test_datasource_connection(
    request: DataSourceTestRequest,
    _: CurrentUser = Depends(get_admin_user)
):
    """Build a connector from the submitted type/config and test it (admin only).

    Returns a success payload when ``test_connection()`` is truthy.

    Raises:
        HTTPException: 400 with detail "Connection failed" when the connector
            reports failure, or "Connection failed: <error>" when building or
            testing the connector raises.
    """
    try:
        connector = get_connector_from_config(request.type, request.config)
        connected = connector.test_connection()
    except Exception as e:
        import sys
        import traceback
        print(f"Datasource Test Error: {str(e)}\n{traceback.format_exc()}", file=sys.stderr)
        raise HTTPException(status_code=400, detail=f"Connection failed: {str(e)}") from e

    # Raised outside the try block so this HTTPException is not caught and
    # re-wrapped by the generic handler above.
    if not connected:
        raise HTTPException(status_code=400, detail="Connection failed")
    return {"success": True, "message": "Connection successful"}
|
||||
@@ -0,0 +1,96 @@
|
||||
from typing import List, Optional
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from openai import OpenAI
|
||||
|
||||
from app.schemas.embedding_model import (
|
||||
EmbeddingModelConfig,
|
||||
EmbeddingModelConfigCreate,
|
||||
EmbeddingModelConfigUpdate,
|
||||
EmbeddingModelConnectionTestRequest
|
||||
)
|
||||
from app.services.embedding_model_store import embedding_model_store
|
||||
from app.services.openai_compat import normalize_openai_base_url
|
||||
from app.api.llm import get_admin_user, get_current_user, CurrentUser
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
def _mask_api_key(value: Optional[str]) -> Optional[str]:
|
||||
if not value:
|
||||
return None
|
||||
if len(value) <= 8:
|
||||
return "*" * len(value)
|
||||
return f"{value[:4]}{'*' * (len(value) - 8)}{value[-4:]}"
|
||||
|
||||
@router.get("/embedding-models", response_model=List[EmbeddingModelConfig])
def list_embedding_models(current_user: CurrentUser = Depends(get_current_user)):
    """List embedding model configs; non-admin callers get ``api_key`` blanked."""
    models = embedding_model_store.list_models()
    if current_user.is_admin:
        return models
    # Redact on shallow copies rather than in place: if the store hands out
    # references to its own records, mutating them would erase the saved key.
    # NOTE(review): assumes the store returns dict-like records; confirm.
    return [{**m, "api_key": None} for m in models]
|
||||
|
||||
@router.post("/embedding-models", response_model=EmbeddingModelConfig)
def create_embedding_model(payload: EmbeddingModelConfigCreate, _: CurrentUser = Depends(get_admin_user)):
    """Create an embedding model config from the request payload (admin only)."""
    fields = payload.model_dump()
    created = embedding_model_store.create_model(fields)
    return created
|
||||
|
||||
@router.get("/embedding-models/{model_id}", response_model=EmbeddingModelConfig)
def get_embedding_model(model_id: str, current_user: CurrentUser = Depends(get_current_user)):
    """Fetch one embedding model config (404 if missing); non-admins get ``api_key`` blanked."""
    model = embedding_model_store.get_model(model_id)
    if not model:
        raise HTTPException(status_code=404, detail="Embedding model not found")
    if not current_user.is_admin:
        # Redact on a shallow copy: mutating the record in place could erase
        # the saved key if the store returns a reference to its own data.
        # NOTE(review): assumes the store returns a dict-like record; confirm.
        model = {**model, "api_key": None}
    return model
|
||||
|
||||
@router.put("/embedding-models/{model_id}", response_model=EmbeddingModelConfig)
def update_embedding_model(model_id: str, payload: EmbeddingModelConfigUpdate, _: CurrentUser = Depends(get_admin_user)):
    """Update an embedding model config with the explicitly-set payload fields (admin only)."""
    changes = payload.model_dump(exclude_unset=True)
    updated = embedding_model_store.update_model(model_id, changes)
    if updated:
        return updated
    raise HTTPException(status_code=404, detail="Embedding model not found")
|
||||
|
||||
@router.delete("/embedding-models/{model_id}")
def delete_embedding_model(model_id: str, _: CurrentUser = Depends(get_admin_user)):
    """Delete an embedding model config (admin only); 404 when the store reports nothing deleted."""
    if embedding_model_store.delete_model(model_id):
        return {"status": "success"}
    raise HTTPException(status_code=404, detail="Embedding model not found")
|
||||
|
||||
@router.post("/embedding-models/test")
def test_embedding_model_connection(payload: EmbeddingModelConnectionTestRequest, _: CurrentUser = Depends(get_admin_user)):
    """Issue one embeddings call with the submitted credentials (admin only).

    Returns a success payload including the embedding dimension when the
    response exposes a list vector. Raises 400 for a missing API base, API
    key or model name, and when the embeddings call itself fails.
    """
    api_base = normalize_openai_base_url(payload.api_base or "")
    api_key = payload.api_key
    model_name = (payload.model or "").strip()

    # Checked in this order so the first missing field determines the error.
    required_fields = (
        (api_base, "API Base is required"),
        (api_key, "API Key is required"),
        (model_name, "Model name is required"),
    )
    for value, message in required_fields:
        if not value:
            raise HTTPException(status_code=400, detail=message)

    try:
        client = OpenAI(api_key=api_key, base_url=api_base)
        embedding_resp = client.embeddings.create(model=model_name, input="connection test")
    except Exception as exc:
        raise HTTPException(status_code=400, detail=f"Embedding call failed: {exc}")

    dimension = None
    items = getattr(embedding_resp, "data", None)
    if items:
        vector = getattr(items[0], "embedding", None)
        if isinstance(vector, list):
            dimension = len(vector)

    result = {"success": True, "message": "Connection successful"}
    result["model_name"] = model_name
    result["embedding_dimension"] = dimension
    return result
|
||||
@@ -0,0 +1,302 @@
|
||||
from typing import List, Optional
|
||||
import io
|
||||
import json
|
||||
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi import UploadFile, File, Form
|
||||
from openai import OpenAI
|
||||
import pandas as pd
|
||||
|
||||
from app.schemas.knowledge import (
|
||||
KnowledgeBase,
|
||||
KnowledgeBaseCreate,
|
||||
KnowledgeConnectionTestRequest,
|
||||
KnowledgeConnectionTestResponse,
|
||||
KnowledgeGlobalConfig,
|
||||
KnowledgeGlobalConfigUpdate,
|
||||
KnowledgeBaseUpdate,
|
||||
KnowledgeDocument,
|
||||
KnowledgeDocumentCreate,
|
||||
KnowledgeDocumentUpdate,
|
||||
KnowledgeSearchRequest,
|
||||
KnowledgeSearchResponse,
|
||||
)
|
||||
from app.services.knowledge_base_store import knowledge_base_store
|
||||
from app.services.knowledge_global_config_store import knowledge_global_config_store
|
||||
from app.services.knowledge_index import knowledge_index_service
|
||||
from app.services.openai_compat import normalize_openai_base_url
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _mask_api_key(value: Optional[str]) -> Optional[str]:
|
||||
if not value:
|
||||
return None
|
||||
if len(value) <= 8:
|
||||
return "*" * len(value)
|
||||
return f"{value[:4]}{'*' * (len(value) - 8)}{value[-4:]}"
|
||||
|
||||
|
||||
def _extract_upload_text(filename: str, content: bytes) -> str:
|
||||
lower = filename.lower()
|
||||
if lower.endswith((".txt", ".md", ".markdown", ".json", ".yaml", ".yml", ".log", ".xml", ".html", ".htm")):
|
||||
try:
|
||||
return content.decode("utf-8")
|
||||
except UnicodeDecodeError:
|
||||
return content.decode("utf-8", errors="ignore")
|
||||
if lower.endswith(".csv"):
|
||||
df = pd.read_csv(io.BytesIO(content))
|
||||
return df.to_csv(index=False)
|
||||
if lower.endswith((".xls", ".xlsx")):
|
||||
df = pd.read_excel(io.BytesIO(content))
|
||||
return df.to_csv(index=False)
|
||||
|
||||
# 增加对 PDF 的文本提取支持
|
||||
if lower.endswith(".pdf"):
|
||||
try:
|
||||
import PyPDF2
|
||||
pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
|
||||
text = []
|
||||
for page in pdf_reader.pages:
|
||||
page_text = page.extract_text()
|
||||
if page_text:
|
||||
text.append(page_text)
|
||||
return "\n".join(text)
|
||||
except ImportError:
|
||||
raise ValueError("PyPDF2 is not installed. Cannot parse PDF files.")
|
||||
except Exception as e:
|
||||
raise ValueError(f"Failed to parse PDF: {str(e)}")
|
||||
|
||||
# 增加对 Word 文档的文本提取支持
|
||||
if lower.endswith((".doc", ".docx")):
|
||||
try:
|
||||
import docx
|
||||
doc = docx.Document(io.BytesIO(content))
|
||||
return "\n".join([para.text for para in doc.paragraphs])
|
||||
except ImportError:
|
||||
raise ValueError("python-docx is not installed. Cannot parse Word files.")
|
||||
except Exception as e:
|
||||
raise ValueError(f"Failed to parse Word document: {str(e)}")
|
||||
|
||||
# 增加对 PPT 文档的文本提取支持
|
||||
if lower.endswith((".ppt", ".pptx")):
|
||||
try:
|
||||
import pptx
|
||||
prs = pptx.Presentation(io.BytesIO(content))
|
||||
text = []
|
||||
for slide in prs.slides:
|
||||
for shape in slide.shapes:
|
||||
if hasattr(shape, "text"):
|
||||
text.append(shape.text)
|
||||
return "\n".join(text)
|
||||
except ImportError:
|
||||
raise ValueError("python-pptx is not installed. Cannot parse PPT files.")
|
||||
except Exception as e:
|
||||
raise ValueError(f"Failed to parse PPT document: {str(e)}")
|
||||
|
||||
raise ValueError("Unsupported file type")
|
||||
|
||||
|
||||
@router.get("/knowledge-bases/global-config", response_model=KnowledgeGlobalConfig)
def get_knowledge_global_config():
    """Return the stored global knowledge config with the API key masked, never raw."""
    stored = knowledge_global_config_store.get()
    secret = stored.get("api_key")
    response = {"api_base": stored.get("api_base"), "api_key": None}
    response["api_key_masked"] = _mask_api_key(secret)
    response["has_api_key"] = bool(secret)
    response["default_embedding_model"] = stored.get("default_embedding_model")
    return response
|
||||
|
||||
|
||||
@router.put("/knowledge-bases/global-config", response_model=KnowledgeGlobalConfig)
def update_knowledge_global_config(payload: KnowledgeGlobalConfigUpdate):
    """Update the global knowledge config with the explicitly-set fields; the API key is returned masked only."""
    changes = payload.model_dump(exclude_unset=True)
    stored = knowledge_global_config_store.update(changes)
    secret = stored.get("api_key")
    response = {"api_base": stored.get("api_base"), "api_key": None}
    response["api_key_masked"] = _mask_api_key(secret)
    response["has_api_key"] = bool(secret)
    response["default_embedding_model"] = stored.get("default_embedding_model")
    return response
|
||||
|
||||
|
||||
@router.post("/knowledge-bases/global-config/test-connection", response_model=KnowledgeConnectionTestResponse)
def test_knowledge_global_connection(payload: KnowledgeConnectionTestRequest):
    """Issue one embeddings call to verify the knowledge-base connection settings.

    ``api_base`` and ``api_key`` fall back to the saved global config when
    the payload omits them; the model name must be supplied explicitly.

    Raises:
        HTTPException: 400 when the API base, API key or model name is
            missing, or when the embeddings call fails.
    """
    saved = knowledge_global_config_store.get()
    api_base = normalize_openai_base_url(payload.api_base or saved.get("api_base") or "")
    api_key = payload.api_key or saved.get("api_key")
    model_name = (payload.model_name or "").strip()

    if not api_base:
        raise HTTPException(status_code=400, detail="API Base 未配置")
    if not api_key:
        raise HTTPException(status_code=400, detail="API Key 未配置")
    if not model_name:
        raise HTTPException(status_code=400, detail="测试连接必须显式填写向量模型名称")

    try:
        client = OpenAI(
            api_key=api_key,
            base_url=api_base,
        )
        embedding_resp = client.embeddings.create(
            model=model_name,
            input="connection test",
        )
    except Exception as exc:
        raise HTTPException(status_code=400, detail=f"Embedding调用失败: {exc}") from exc

    # The dimension is reported only when the response exposes a list vector.
    dimension = None
    if getattr(embedding_resp, "data", None):
        first = embedding_resp.data[0]
        vector = getattr(first, "embedding", None)
        if isinstance(vector, list):
            dimension = len(vector)
    return {
        "success": True,
        "message": "连接成功,Embedding调用正常",
        "model_name": model_name,
        "embedding_dimension": dimension,
        "resolved_api_base": api_base,
        "available_models": [],
    }
|
||||
|
||||
|
||||
@router.get("/knowledge-bases", response_model=List[KnowledgeBase])
def list_knowledge_bases(project_id: Optional[int] = None):
    """Return what the knowledge base store lists for the given ``project_id`` filter."""
    knowledge_bases = knowledge_base_store.list(project_id=project_id)
    return knowledge_bases
|
||||
|
||||
|
||||
@router.post("/knowledge-bases", response_model=KnowledgeBase)
def create_knowledge_base(payload: KnowledgeBaseCreate):
    """Create a knowledge base from the request payload and return the stored record."""
    fields = payload.model_dump()
    created = knowledge_base_store.create(fields)
    return created
|
||||
|
||||
|
||||
@router.get("/knowledge-bases/{kb_id}", response_model=KnowledgeBase)
def get_knowledge_base(kb_id: str):
    """Fetch one knowledge base by id; 404 when the store has no such entry."""
    found = knowledge_base_store.get(kb_id)
    if found:
        return found
    raise HTTPException(status_code=404, detail="Knowledge base not found")
|
||||
|
||||
|
||||
@router.put("/knowledge-bases/{kb_id}", response_model=KnowledgeBase)
def update_knowledge_base(kb_id: str, payload: KnowledgeBaseUpdate):
    """Update a knowledge base with the explicitly-set payload fields; 404 when the store returns nothing."""
    changes = payload.model_dump(exclude_unset=True)
    updated = knowledge_base_store.update(kb_id, changes)
    if updated:
        return updated
    raise HTTPException(status_code=404, detail="Knowledge base not found")
|
||||
|
||||
|
||||
@router.delete("/knowledge-bases/{kb_id}")
def delete_knowledge_base(kb_id: str):
    """Delete a knowledge base; 404 when the store reports nothing deleted."""
    if knowledge_base_store.delete(kb_id):
        return {"status": "success"}
    raise HTTPException(status_code=404, detail="Knowledge base not found")
|
||||
|
||||
|
||||
@router.get("/knowledge-bases/{kb_id}/documents", response_model=List[KnowledgeDocument])
def list_knowledge_documents(kb_id: str):
    """Return the ``documents`` entry of a knowledge base (empty list if absent); 404 if the base is missing."""
    knowledge_base = knowledge_base_store.get(kb_id)
    if knowledge_base:
        return knowledge_base.get("documents", [])
    raise HTTPException(status_code=404, detail="Knowledge base not found")
|
||||
|
||||
|
||||
@router.post("/knowledge-bases/{kb_id}/documents", response_model=KnowledgeDocument)
def create_knowledge_document(kb_id: str, payload: KnowledgeDocumentCreate):
    """Create a document in a knowledge base; 404 when the store returns nothing."""
    fields = payload.model_dump()
    created = knowledge_base_store.create_document(kb_id=kb_id, payload=fields)
    if created:
        return created
    raise HTTPException(status_code=404, detail="Knowledge base not found")
|
||||
|
||||
|
||||
@router.put("/knowledge-bases/{kb_id}/documents/{doc_id}", response_model=KnowledgeDocument)
def update_knowledge_document(kb_id: str, doc_id: str, payload: KnowledgeDocumentUpdate):
    # Partial update of one document; unset fields are left untouched.
    changes = payload.model_dump(exclude_unset=True)
    updated = knowledge_base_store.update_document(kb_id=kb_id, doc_id=doc_id, payload=changes)
    if updated:
        return updated
    raise HTTPException(status_code=404, detail="Knowledge document not found")
|
||||
|
||||
|
||||
@router.delete("/knowledge-bases/{kb_id}/documents/{doc_id}")
def delete_knowledge_document(kb_id: str, doc_id: str):
    # The store reports via a truthy/falsy result whether the document was removed.
    if knowledge_base_store.delete_document(kb_id=kb_id, doc_id=doc_id):
        return {"status": "success"}
    raise HTTPException(status_code=404, detail="Knowledge document not found")
|
||||
|
||||
|
||||
@router.post("/knowledge-bases/{kb_id}/reindex")
def reindex_knowledge_base(kb_id: str):
    # The index service signals failure with ValueError; its message is
    # surfaced to the client as a 404 detail.
    try:
        outcome = knowledge_index_service.reindex(kb_id)
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc))
    return outcome
|
||||
|
||||
|
||||
@router.post("/knowledge-bases/{kb_id}/search", response_model=KnowledgeSearchResponse)
def search_knowledge_base(kb_id: str, payload: KnowledgeSearchRequest):
    # Run a search against one knowledge base; a ValueError from the index
    # service is translated into a 404 carrying the service's message.
    try:
        return knowledge_index_service.search(
            kb_id=kb_id,
            query=payload.query,
            top_k=payload.top_k,
        )
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc))
|
||||
|
||||
|
||||
@router.post("/knowledge-bases/{kb_id}/documents/upload")
async def upload_knowledge_documents(
    kb_id: str,
    files: List[UploadFile] = File(...),
    metadata: Optional[str] = Form(default=None),
):
    # Upload one or more files into a knowledge base.
    #
    # kb_id:    target knowledge base; 404 when the store does not know it.
    # files:    uploaded files; empty files are skipped silently.
    # metadata: optional JSON object (as a string) merged into every created
    #           document's metadata. Anything that is not a JSON object
    #           (or JSON null, which means "no metadata") is rejected with 400.
    #
    # Returns {"status", "count", "documents"} describing the created documents.
    kb = knowledge_base_store.get(kb_id)
    if not kb:
        raise HTTPException(status_code=404, detail="Knowledge base not found")

    metadata_payload: dict[str, Any] = {}
    if metadata:
        try:
            parsed_metadata = json.loads(metadata)
        except json.JSONDecodeError:
            raise HTTPException(status_code=400, detail="metadata 必须是合法 JSON 对象")
        if parsed_metadata is not None:
            # Previously a list/number/string was silently ignored even though
            # the error message promises that only a JSON object is accepted.
            if not isinstance(parsed_metadata, dict):
                raise HTTPException(status_code=400, detail="metadata 必须是合法 JSON 对象")
            metadata_payload = parsed_metadata

    created: List[dict[str, Any]] = []
    for file in files:
        filename = file.filename or "untitled"
        content = await file.read()
        if not content:
            continue
        # The size limit was relaxed from 5MB to 15MB to better support PDF
        # files that contain images.
        if len(content) > 15 * 1024 * 1024:
            raise HTTPException(status_code=400, detail=f"文件过大 (超过 15MB): {filename}")
        try:
            text = _extract_upload_text(filename, content)
        except Exception:
            # NOTE(review): every extraction failure is reported as an
            # unsupported file type; confirm that is the intended contract.
            raise HTTPException(status_code=400, detail=f"不支持的文件类型: {filename}")
        doc = knowledge_base_store.create_document(
            kb_id=kb_id,
            payload={
                "title": filename,
                "content": text,
                # Upload bookkeeping keys win over client-supplied metadata.
                "metadata": {**metadata_payload, "source": "upload", "filename": filename},
            },
        )
        if doc:
            created.append(doc)
    return {"status": "success", "count": len(created), "documents": created}
|
||||
@@ -0,0 +1,183 @@
|
||||
import json
|
||||
import os
|
||||
from typing import List, Optional, Dict, Any
|
||||
from fastapi import APIRouter, HTTPException, Depends, status
|
||||
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
||||
from jose import jwt, JWTError
|
||||
from pydantic import BaseModel, Field
|
||||
from app.core.security import SECRET_KEY, ALGORITHM
|
||||
from app.core.data_root import get_data_root
|
||||
from app.core.llm_provider import build_llm_provider
|
||||
|
||||
router = APIRouter()
|
||||
security = HTTPBearer()
|
||||
|
||||
DATA_FILE = str(get_data_root() / "llm_config.json")
|
||||
|
||||
|
||||
class CurrentUser(BaseModel):
    # Identity extracted from a decoded bearer token (see get_current_user).
    id: int
    username: str
    # Admin flag; treated as False when the token does not carry it.
    is_admin: bool = False
|
||||
|
||||
class LLMConfig(BaseModel):
    # Full LLM configuration record as returned by the /llm endpoints.
    id: str = Field(..., description="Unique identifier for the LLM configuration")
    name: Optional[str] = Field(default=None, description="Display name")
    provider: str = Field(..., description="Provider name (e.g., openai, azure, anthropic)")
    model: str = Field(..., description="Model name (e.g., gpt-4, claude-3-opus)")
    api_key: Optional[str] = Field(default=None, description="API Key for the provider")
    api_base: Optional[str] = Field(default=None, description="Base URL for the API")
    extra_headers: Optional[Dict[str, str]] = Field(default=None, description="Extra headers for the request")
    is_active: bool = Field(default=True, description="Whether this configuration is active")
|
||||
|
||||
class LLMConfigCreate(BaseModel):
    # Request body for creating an LLM configuration; the client picks the id.
    id: str
    name: Optional[str] = None
    provider: str
    model: str
    api_key: Optional[str] = None
    api_base: Optional[str] = None
    extra_headers: Optional[Dict[str, str]] = None
    # New configurations are active unless the client says otherwise.
    is_active: bool = True
|
||||
|
||||
class LLMConfigUpdate(BaseModel):
    # Request body for a partial update: every field is optional and only the
    # fields actually sent are applied by update_llm_config.
    name: Optional[str] = None
    provider: Optional[str] = None
    model: Optional[str] = None
    api_key: Optional[str] = None
    api_base: Optional[str] = None
    extra_headers: Optional[Dict[str, str]] = None
    is_active: Optional[bool] = None
|
||||
|
||||
class TestConnectionRequest(BaseModel):
    # Ad-hoc provider settings used by the /llm/test endpoint; nothing here
    # is persisted by that endpoint.
    provider: str
    model: str
    api_key: Optional[str] = None
    api_base: Optional[str] = None
    extra_headers: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)) -> CurrentUser:
    """Resolve the caller from the bearer token.

    Decodes the JWT with SECRET_KEY/ALGORITHM and builds a CurrentUser from
    the "id", "sub" and "is_admin" claims.

    Raises:
        HTTPException: 401 when the token cannot be decoded or lacks the
            "id" or "sub" claim.
    """
    auth_error = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid authentication credentials",
    )
    try:
        claims = jwt.decode(credentials.credentials, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise auth_error

    uid, name = claims.get("id"), claims.get("sub")
    if uid is None or name is None:
        raise auth_error
    return CurrentUser(id=uid, username=name, is_admin=bool(claims.get("is_admin", False)))
|
||||
|
||||
|
||||
def get_admin_user(current_user: CurrentUser = Depends(get_current_user)) -> CurrentUser:
    """Dependency that lets only administrators through.

    Raises:
        HTTPException: 403 when the authenticated user is not an admin.
    """
    if current_user.is_admin:
        return current_user
    raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Admin permission required")
|
||||
|
||||
def _load_data() -> List[Dict[str, Any]]:
    """Load all stored LLM configurations from DATA_FILE.

    Returns:
        The list stored in the file, or an empty list when the file is
        missing, is not valid JSON, or does not contain a JSON array.
    """
    if not os.path.exists(DATA_FILE):
        return []
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError:
        return []
    # Callers iterate the result and index each item by "id"; anything other
    # than a list (e.g. a hand-edited object) would crash them.
    return data if isinstance(data, list) else []
|
||||
|
||||
def _save_data(data: List[Dict[str, Any]]):
    """Write the full list of LLM configurations to DATA_FILE.

    The parent directory is created first if it does not exist yet.
    """
    parent_dir = os.path.dirname(DATA_FILE)
    os.makedirs(parent_dir, exist_ok=True)
    with open(DATA_FILE, "w") as out:
        json.dump(data, out, indent=2)
|
||||
|
||||
|
||||
def _sanitize_config(item: Dict[str, Any], is_admin: bool) -> Dict[str, Any]:
    """Return a shallow copy of *item* that is safe to show to the caller.

    Admins get the record unchanged; for everyone else "api_key" is set to
    None. The input dict is never modified.
    """
    if is_admin:
        return dict(item)
    return {**item, "api_key": None}
|
||||
|
||||
@router.get("/llm", response_model=List[LLMConfig])
def list_llm_configs(current_user: CurrentUser = Depends(get_current_user)):
    # Return every stored configuration; API keys are blanked for non-admins.
    visible = []
    for item in _load_data():
        visible.append(LLMConfig(**_sanitize_config(item, current_user.is_admin)))
    return visible
|
||||
|
||||
@router.get("/llm/{config_id}", response_model=LLMConfig)
def get_llm_config(config_id: str, current_user: CurrentUser = Depends(get_current_user)):
    # Return the first stored configuration with a matching id (API key
    # blanked for non-admins), or 404 when there is none.
    match = next((item for item in _load_data() if item["id"] == config_id), None)
    if match is None:
        raise HTTPException(status_code=404, detail="LLM configuration not found")
    return LLMConfig(**_sanitize_config(match, current_user.is_admin))
|
||||
|
||||
@router.post("/llm", response_model=LLMConfig)
def create_llm_config(config: LLMConfigCreate, _: CurrentUser = Depends(get_admin_user)):
    # Create a new LLM configuration (admin only).
    #
    # Rejects duplicate ids with 400. When the new configuration is active,
    # every existing configuration is deactivated first so that at most one
    # stays active.
    data = _load_data()
    if any(item["id"] == config.id for item in data):
        raise HTTPException(status_code=400, detail="LLM configuration with this ID already exists")

    # model_dump() is the pydantic v2 spelling of the deprecated .dict().
    new_config = config.model_dump()
    if new_config.get("is_active"):
        for item in data:
            item["is_active"] = False
    data.append(new_config)
    _save_data(data)
    return LLMConfig(**new_config)
|
||||
|
||||
@router.put("/llm/{config_id}", response_model=LLMConfig)
def update_llm_config(config_id: str, config: LLMConfigUpdate, _: CurrentUser = Depends(get_admin_user)):
    # Partially update one LLM configuration (admin only).
    #
    # Only fields present in the request are applied. Activating this
    # configuration deactivates all others. Returns 404 for an unknown id.
    data = _load_data()
    for i, item in enumerate(data):
        if item["id"] != config_id:
            continue
        # Copy before the deactivation sweep below so the sweep cannot affect
        # the record that is about to be returned.
        updated_item = item.copy()
        # model_dump() is the pydantic v2 spelling of the deprecated .dict().
        update_data = config.model_dump(exclude_unset=True)
        if update_data.get("is_active"):
            for other in data:
                other["is_active"] = False
        updated_item.update(update_data)
        data[i] = updated_item
        _save_data(data)
        return LLMConfig(**updated_item)
    raise HTTPException(status_code=404, detail="LLM configuration not found")
|
||||
|
||||
@router.delete("/llm/{config_id}")
def delete_llm_config(config_id: str, _: CurrentUser = Depends(get_admin_user)):
    # Remove every stored configuration with the given id (admin only);
    # 404 when nothing matched.
    stored = _load_data()
    remaining = [item for item in stored if item["id"] != config_id]
    if len(remaining) == len(stored):
        raise HTTPException(status_code=404, detail="LLM configuration not found")
    _save_data(remaining)
    return {"message": "LLM configuration deleted successfully"}
|
||||
|
||||
@router.post("/llm/test")
async def test_connection(request: TestConnectionRequest, _: CurrentUser = Depends(get_admin_user)):
    # Probe a provider with a tiny chat request (admin only).
    #
    # Any failure - building the provider, the call itself, or a response
    # whose finish_reason is "error" - is reported as a 400.
    try:
        llm = build_llm_provider(
            model=request.model.strip(),
            provider=request.provider,
            api_key=request.api_key,
            api_base=request.api_base,
            extra_headers=request.extra_headers,
        )
        probe = [{"role": "user", "content": "Hello"}]
        reply = await llm.chat(messages=probe, max_tokens=5, temperature=0)
        if reply.finish_reason == "error":
            raise ValueError(reply.content or "Unknown provider error")
        details = {
            "content": reply.content,
            "finish_reason": reply.finish_reason,
            "usage": reply.usage,
        }
        return {"success": True, "message": "Connection successful", "details": details}
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Connection failed: {str(e)}")
|
||||
@@ -0,0 +1,135 @@
|
||||
import json
|
||||
import uuid
|
||||
import asyncio
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
from contextlib import AsyncExitStack
|
||||
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from pydantic import BaseModel
|
||||
from mcp import ClientSession, StdioServerParameters
|
||||
from mcp.client.stdio import stdio_client
|
||||
from mcp.client.sse import sse_client
|
||||
|
||||
from app.schemas.mcp import MCPServer, MCPServerCreate, MCPServerUpdate
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
def get_mcp_servers_file() -> Path:
    """Return the path of the JSON file that stores MCP server definitions."""
    data_root = get_data_root()
    return data_root / "mcp_servers.json"
|
||||
|
||||
def read_mcp_servers() -> List[dict]:
    """Read the stored MCP server definitions.

    Returns an empty list when the file is missing or holds invalid JSON.
    """
    servers_file = get_mcp_servers_file()
    if servers_file.exists():
        try:
            with open(servers_file, "r", encoding="utf-8") as fh:
                return json.load(fh)
        except json.JSONDecodeError:
            pass
    return []
|
||||
|
||||
def write_mcp_servers(servers: List[dict]) -> None:
    """Persist the full list of MCP server definitions as UTF-8 JSON.

    The parent directory is created when missing, so the first write on a
    fresh data root does not fail with FileNotFoundError.
    """
    file_path = get_mcp_servers_file()
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(servers, f, indent=2, ensure_ascii=False)
|
||||
|
||||
async def _check_single_mcp_health(server: dict) -> str:
    """Try to open and initialize an MCP session for one server definition.

    Returns "connected" on success, otherwise a string starting with
    "error: ". Only the session initialization is bounded by the 5 second
    timeout; opening the transport is not.
    """
    try:
        async with AsyncExitStack() as stack:
            kind = server.get("type")
            if kind == "stdio":
                transport = stdio_client(
                    StdioServerParameters(
                        command=server.get("command", ""),
                        args=server.get("args", []),
                        env=server.get("env"),
                    )
                )
            elif kind in ["sse", "streamableHttp"]:
                # NOTE(review): "streamableHttp" servers are probed with the
                # SSE client - confirm this is intended.
                transport = sse_client(server.get("url", ""))
            else:
                return "error: unsupported type"

            read, write = await stack.enter_async_context(transport)
            session = await stack.enter_async_context(ClientSession(read, write))
            await asyncio.wait_for(session.initialize(), timeout=5.0)
            return "connected"
    except Exception as e:
        reason = str(e)
        # This TaskGroup message is reported to the client as a refused connection.
        if "unhandled errors in a TaskGroup" in reason:
            return "error: connection refused"
        return f"error: {reason or 'unknown'}"
|
||||
|
||||
@router.get("/mcp", response_model=List[MCPServer])
async def list_mcp_servers(project_id: Optional[int] = None):
    # List MCP servers (optionally for one project) with a fresh health status.
    #
    # Every listed server is probed concurrently. When any status changed, the
    # new statuses are written back to the store.
    servers = read_mcp_servers()
    if project_id is not None:
        servers = [s for s in servers if s.get("project_id") == project_id]

    if not servers:
        return []

    statuses = await asyncio.gather(
        *(_check_single_mcp_health(s) for s in servers),
        return_exceptions=True,
    )

    needs_update = False
    for server, status in zip(servers, statuses):
        # gather() may hand back an exception object instead of a string.
        new_status = status if isinstance(status, str) else f"error: {str(status)}"
        if server.get("status") != new_status:
            server["status"] = new_status
            needs_update = True

    if needs_update:
        # Re-read the complete file so servers filtered out above are kept,
        # then patch statuses by id with a single lookup per server instead of
        # a nested scan.
        status_by_id = {checked.get("id"): checked["status"] for checked in servers}
        all_servers = read_mcp_servers()
        for s in all_servers:
            server_id = s.get("id")
            if server_id in status_by_id:
                s["status"] = status_by_id[server_id]
        write_mcp_servers(all_servers)

    return servers
|
||||
|
||||
@router.post("/mcp", response_model=MCPServer)
def create_mcp_server(server_in: MCPServerCreate):
    # Store a new MCP server under a freshly generated UUID; a missing or
    # empty status defaults to "disconnected".
    existing = read_mcp_servers()

    record = server_in.dict()
    record["id"] = str(uuid.uuid4())
    if not record.get("status"):
        record["status"] = "disconnected"

    existing.append(record)
    write_mcp_servers(existing)
    return record
|
||||
|
||||
@router.get("/mcp/{server_id}", response_model=MCPServer)
def get_mcp_server(server_id: str):
    # Return the first stored server with a matching id, or 404.
    found = next((s for s in read_mcp_servers() if s.get("id") == server_id), None)
    if found is None:
        raise HTTPException(status_code=404, detail="MCP Server not found")
    return found
|
||||
|
||||
@router.put("/mcp/{server_id}", response_model=MCPServer)
def update_mcp_server(server_id: str, server_in: MCPServerUpdate):
    # Apply the explicitly provided fields to the matching server, persist the
    # whole list and return the updated record; 404 for an unknown id.
    stored = read_mcp_servers()
    for record in stored:
        if record.get("id") != server_id:
            continue
        record.update(server_in.dict(exclude_unset=True))
        write_mcp_servers(stored)
        return record
    raise HTTPException(status_code=404, detail="MCP Server not found")
|
||||
|
||||
@router.delete("/mcp/{server_id}")
def delete_mcp_server(server_id: str):
    # Drop every stored server with the given id; 404 when nothing matched.
    stored = read_mcp_servers()
    remaining = [s for s in stored if s.get("id") != server_id]
    if len(remaining) == len(stored):
        raise HTTPException(status_code=404, detail="MCP Server not found")
    write_mcp_servers(remaining)
    return {"status": "success"}
|
||||
@@ -0,0 +1,92 @@
|
||||
from typing import List
|
||||
from fastapi import APIRouter, HTTPException, Depends, status
|
||||
from sqlalchemy.orm import Session
|
||||
from app.database import get_db
|
||||
from app.models.project import Project
|
||||
from app.schemas.project import ProjectCreate, ProjectUpdate, Project as ProjectSchema
|
||||
from app.core.security import get_current_user, CurrentUser
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/projects", response_model=List[ProjectSchema])
def list_projects(
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    # Paginated project listing. Admins see every project; everyone else only
    # sees the projects they own.
    projects_query = db.query(Project)
    if not current_user.is_admin:
        projects_query = projects_query.filter(Project.owner_id == current_user.id)
    return projects_query.offset(skip).limit(limit).all()
|
||||
|
||||
@router.post("/projects", response_model=ProjectSchema)
def create_project(
    project: ProjectCreate,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    # Create a project owned by the authenticated user and return the
    # refreshed row.
    fields = project.dict()
    new_project = Project(**fields, owner_id=current_user.id)
    db.add(new_project)
    db.commit()
    db.refresh(new_project)
    return new_project
|
||||
|
||||
@router.get("/projects/{project_id}", response_model=ProjectSchema)
def read_project(
    project_id: int,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    # Fetch one project: 404 when it does not exist, 403 when the caller is
    # neither its owner nor an admin.
    found = db.query(Project).filter(Project.id == project_id).first()
    if found is None:
        raise HTTPException(status_code=404, detail="Project not found")

    owns_project = found.owner_id == current_user.id
    if not (current_user.is_admin or owns_project):
        raise HTTPException(status_code=403, detail="Not enough permissions")

    return found
|
||||
|
||||
@router.put("/projects/{project_id}", response_model=ProjectSchema)
def update_project(
    project_id: int,
    project: ProjectUpdate,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    # Apply the explicitly provided fields to a project: 404 when it does not
    # exist, 403 when the caller is neither its owner nor an admin.
    target = db.query(Project).filter(Project.id == project_id).first()
    if target is None:
        raise HTTPException(status_code=404, detail="Project not found")

    owns_project = target.owner_id == current_user.id
    if not (current_user.is_admin or owns_project):
        raise HTTPException(status_code=403, detail="Not enough permissions")

    for field, value in project.dict(exclude_unset=True).items():
        setattr(target, field, value)

    db.add(target)
    db.commit()
    db.refresh(target)
    return target
|
||||
|
||||
@router.delete("/projects/{project_id}")
def delete_project(
    project_id: int,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    # Delete a project: 404 when it does not exist, 403 when the caller is
    # neither its owner nor an admin.
    doomed = db.query(Project).filter(Project.id == project_id).first()
    if doomed is None:
        raise HTTPException(status_code=404, detail="Project not found")

    owns_project = doomed.owner_id == current_user.id
    if not (current_user.is_admin or owns_project):
        raise HTTPException(status_code=403, detail="Not enough permissions")

    db.delete(doomed)
    db.commit()
    return {"status": "success"}
|
||||
@@ -0,0 +1,146 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import Dict, Any, List, Optional
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.database import get_db
|
||||
from app.models.datasource import DataSource
|
||||
from app.schemas.mdl import MDLManifest
|
||||
from app.services.mdl import MDLService
|
||||
from app.connectors.factory import get_connector
|
||||
|
||||
router = APIRouter(tags=["semantic"])
|
||||
|
||||
class GenerateMDLRequest(BaseModel):
    # Optional restrictions passed to MDLService.generate_default_mdl.
    # Both fields default to None.
    selected_tables: Optional[List[str]] = None
    # presumably keyed by table name - verify against MDLService
    selected_columns: Optional[Dict[str, List[str]]] = None
|
||||
|
||||
class ModelDetailResponse(BaseModel):
    # Response body of the model detail endpoint.
    # Serialized model definition (name, tableReference, columns, ...).
    model: Dict[str, Any]
    # Relationships whose "models" list contains this model's name.
    relationships: List[Dict[str, Any]]
    # Sample rows from the backing table; empty when the preview query failed.
    preview_rows: List[Dict[str, Any]]
|
||||
|
||||
def _normalize_query_result(results: Any) -> List[Dict[str, Any]]:
    """Convert a connector query result into a list of row dictionaries.

    Supported shapes:
      * list of dicts            -> returned unchanged
      * list of lists/tuples     -> one dict per row keyed by the column
                                    position as a string ("0", "1", ...)
      * (rows, columns) 2-tuple  -> one dict per row keyed by the first
                                    element of each column descriptor

    Anything else, including empty input, yields an empty list. The shape of
    a list is decided by looking at its first element only.
    """
    if isinstance(results, list):
        if not results:
            return []
        first = results[0]
        if isinstance(first, dict):
            return results
        if isinstance(first, (list, tuple)):
            # Keys must be strings to match the declared return type (and the
            # Dict[str, Any] response field this feeds).
            return [
                {str(position): value for position, value in enumerate(row)}
                for row in results
            ]
        return []
    if isinstance(results, tuple) and len(results) == 2:
        rows, cols = results
        # Either half may be None; treat that as "nothing there".
        col_names = [c[0] for c in (cols or [])]
        return [dict(zip(col_names, row)) for row in (rows or [])]
    return []
|
||||
|
||||
@router.get("/semantic/{datasource_id}/schema", response_model=Dict[str, List[Dict[str, str]]])
def get_semantic_schema(datasource_id: int, db: Session = Depends(get_db)):
    # Return the raw schema of a datasource as {table: [column, ...]}.
    # 404 when the datasource does not exist; any failure while reading the
    # schema is reported as a 500 carrying the exception text.
    ds = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if not ds:
        raise HTTPException(status_code=404, detail="DataSource not found")

    try:
        columns_by_table = {}
        for table, data in MDLService.get_raw_schema(ds).items():
            # A table entry is either {"columns": [...]} or the column list
            # itself; entries of any other shape are left out.
            if isinstance(data, dict) and "columns" in data:
                columns_by_table[table] = data["columns"]
            elif isinstance(data, list):
                columns_by_table[table] = data
        return columns_by_table
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.get("/semantic/{datasource_id}", response_model=MDLManifest)
def get_semantic_model(datasource_id: int, db: Session = Depends(get_db)):
    # Return the MDL manifest of a datasource via MDLService.get_or_create_mdl.
    # 404 for an unknown datasource, 500 with the exception text otherwise.
    datasource = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if not datasource:
        raise HTTPException(status_code=404, detail="DataSource not found")

    try:
        return MDLService.get_or_create_mdl(datasource_id)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.put("/semantic/{datasource_id}", response_model=MDLManifest)
def update_semantic_model(datasource_id: int, mdl: MDLManifest, db: Session = Depends(get_db)):
    # Replace the stored MDL manifest of a datasource with the submitted one
    # and echo it back. 404 for an unknown datasource, 500 when saving fails.
    datasource = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if not datasource:
        raise HTTPException(status_code=404, detail="DataSource not found")

    try:
        MDLService.save_mdl(datasource_id, mdl)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return mdl
|
||||
|
||||
@router.post("/semantic/{datasource_id}/generate", response_model=MDLManifest)
def regenerate_semantic_model(datasource_id: int, request: Optional[GenerateMDLRequest] = None, db: Session = Depends(get_db)):
    # Regenerate the default MDL manifest for a datasource, optionally limited
    # to the tables/columns named in the request body, then store and return it.
    ds = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if not ds:
        raise HTTPException(status_code=404, detail="DataSource not found")

    try:
        if request:
            tables, columns = request.selected_tables, request.selected_columns
        else:
            tables, columns = None, None
        manifest = MDLService.generate_default_mdl(
            ds,
            selected_tables=tables,
            selected_columns=columns,
        )
        MDLService.save_mdl(datasource_id, manifest)
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.get("/semantic/{datasource_id}/models/{model_name}", response_model=ModelDetailResponse)
def get_model_detail(datasource_id: int, model_name: str, limit: int = 10, db: Session = Depends(get_db)):
    # Return one model of a datasource's MDL together with the relationships
    # that mention it and a small preview of its backing table.
    #
    # limit: requested number of preview rows, clamped to the range 1..100.
    # 404 when the datasource or the model does not exist. The preview is
    # best effort: any failure while querying yields an empty preview.
    ds = db.query(DataSource).filter(DataSource.id == datasource_id).first()
    if not ds:
        raise HTTPException(status_code=404, detail="DataSource not found")

    mdl = MDLService.get_or_create_mdl(datasource_id)
    model = next((m for m in mdl.models if m.name == model_name), None)
    if not model:
        raise HTTPException(status_code=404, detail="Model not found")

    relationships = [
        {
            "name": rel.name,
            "models": rel.models,
            "joinType": rel.joinType,
            "condition": rel.condition,
            "properties": rel.properties,
        }
        for rel in mdl.relationships
        if model_name in rel.models
    ]

    preview_rows: List[Dict[str, Any]] = []
    try:
        connector = get_connector(ds)
        table_name = model.tableReference.table if model.tableReference else model.name
        # Identifiers cannot be bound as parameters, so double any embedded
        # double quote (standard SQL escaping) to keep the name from breaking
        # out of the quoted identifier.
        quoted_table = '"' + str(table_name).replace('"', '""') + '"'
        row_limit = max(1, min(limit, 100))
        query = f"SELECT * FROM {quoted_table} LIMIT {row_limit}"
        raw = connector.execute_query(query)
        preview_rows = _normalize_query_result(raw)
    except Exception:
        # Deliberate best effort: the model definition is still useful
        # without sample rows.
        preview_rows = []

    model_payload = {
        "name": model.name,
        "tableReference": model.tableReference.model_dump(by_alias=True) if model.tableReference else None,
        "primaryKey": model.primaryKey,
        "properties": model.properties,
        "columns": [c.model_dump(by_alias=True) for c in model.columns],
    }

    return ModelDetailResponse(
        model=model_payload,
        relationships=relationships,
        preview_rows=preview_rows,
    )
|
||||
@@ -0,0 +1,562 @@
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import zipfile
|
||||
import tarfile
|
||||
import re
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
from datetime import datetime
|
||||
from fastapi import APIRouter, HTTPException, UploadFile, File, Form
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.core.data_root import get_data_root, get_workspace_root
|
||||
from nanobot.agent.skills import BUILTIN_SKILLS_DIR as NANOBOT_BUILTIN_SKILLS_DIR
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
DATA_FILE = str(get_data_root() / "skills.json")
|
||||
SKILL_HUB_DIR = str(get_workspace_root() / "skills")
|
||||
BACKEND_BUILTIN_SKILLS_DIR = str(Path(__file__).resolve().parents[1] / "skills_builtin")
|
||||
|
||||
# Stable keys describing where a skill came from.
SOURCE_LOCAL_IMPORT = "local_import"
SOURCE_SYSTEM_BUILTIN = "system_builtin"
SOURCE_BACKEND_GENERATED = "backend_generated"
SOURCE_UPLOADED_FILE = "uploaded_file"

# Stable keys for a skill's security status.
STATUS_SAFE = "safe"
STATUS_LOW_RISK = "low_risk"

# Every accepted spelling of a source (the stable key itself, then the Chinese
# and English display labels) mapped to its stable key.
_SOURCE_ALIASES = {
    alias: key
    for key, labels in (
        (SOURCE_LOCAL_IMPORT, ("本地导入", "Local Import")),
        (SOURCE_SYSTEM_BUILTIN, ("系统内置", "System Built-in")),
        (SOURCE_BACKEND_GENERATED, ("后台生成", "Backend Generated")),
        (SOURCE_UPLOADED_FILE, ("文件上传", "File Upload")),
    )
    for alias in (key, *labels)
}

# Same idea for the security status.
_STATUS_ALIASES = {
    alias: key
    for key, labels in (
        (STATUS_SAFE, ("安全", "Safe")),
        (STATUS_LOW_RISK, ("低风险", "Low Risk")),
    )
    for alias in (key, *labels)
}
|
||||
|
||||
|
||||
def _normalize_source(value: Optional[str]) -> str:
    """Map a source label to its stable key.

    Empty/None falls back to SOURCE_LOCAL_IMPORT; labels that are not a known
    alias are returned unchanged.
    """
    return _SOURCE_ALIASES.get(value, value) if value else SOURCE_LOCAL_IMPORT
|
||||
|
||||
|
||||
def _normalize_status(value: Optional[str]) -> str:
    """Map a status label to its stable key.

    Empty/None falls back to STATUS_SAFE; labels that are not a known alias
    are returned unchanged.
    """
    return _STATUS_ALIASES.get(value, value) if value else STATUS_SAFE
|
||||
|
||||
def _ensure_skill_hub_dir() -> None:
    """Create the skill hub directory (and missing parents) if needed."""
    hub_dir = SKILL_HUB_DIR
    os.makedirs(hub_dir, exist_ok=True)
|
||||
|
||||
class Skill(BaseModel):
    # Skill record as exposed by the API. "source" and "status" hold the
    # stable keys defined at module level; "installation_time" defaults to
    # today's date formatted with Chinese year/month/day markers.
    id: str = Field(..., description="Unique identifier for the skill")
    name: str = Field(..., description="Name of the skill")
    description: Optional[str] = Field(default=None, description="Description of what the skill does")
    content: str = Field(..., description="The content/prompt/logic of the skill")
    type: str = Field(default="python", description="Type of the skill (python, sql, api)")
    project_id: Optional[int] = Field(default=None, description="The ID of the project this skill belongs to")
    source: str = Field(default=SOURCE_LOCAL_IMPORT, description="Stable source key of the skill")
    installation_time: str = Field(default_factory=lambda: datetime.now().strftime("%Y年%m月%d日"), description="Time when the skill was installed")
    status: str = Field(default=STATUS_SAFE, description="Stable security status key")
    file_path: Optional[str] = Field(default=None, description="Path to the skill folder in skill-hub")
    is_builtin: bool = Field(default=False, description="Whether this is a system builtin skill")
|
||||
|
||||
class SkillCreate(BaseModel):
    # Request body for creating a skill; the client supplies the id.
    id: str
    name: str
    description: Optional[str] = None
    content: str
    type: str = "python"
    project_id: Optional[int] = None
    source: str = SOURCE_LOCAL_IMPORT
    # None here means the client did not provide an installation time.
    installation_time: Optional[str] = None
    status: str = STATUS_SAFE
    file_path: Optional[str] = None
|
||||
|
||||
class SkillUpdate(BaseModel):
    # Request body for updating a skill: every field is optional and
    # defaults to None.
    name: Optional[str] = None
    description: Optional[str] = None
    content: Optional[str] = None
    type: Optional[str] = None
    project_id: Optional[int] = None
    source: Optional[str] = None
    installation_time: Optional[str] = None
    status: Optional[str] = None
    file_path: Optional[str] = None
|
||||
|
||||
def _parse_skill_md(file_path: str) -> Dict[str, Any]:
    """Parse SKILL.md for metadata and content according to agentskills.io standard.

    The file may start with a YAML frontmatter block delimited by ``---``
    lines, followed by a Markdown body.

    Args:
        file_path: Path of the SKILL.md file.

    Returns:
        An empty dict when the file is missing or unreadable; otherwise a dict
        with "name", "description", "content" (the Markdown body) and
        "metadata" (the parsed frontmatter mapping, possibly empty). When the
        frontmatter has no name, the first ``# `` heading of the body is used.
    """
    if not os.path.exists(file_path):
        return {}

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return {}

    # Split YAML frontmatter and Markdown body. Only "---" delimiters are
    # recognised.
    metadata: Dict[str, Any] = {}
    body = content

    if content.startswith('---'):
        parts = content.split('---', 2)
        if len(parts) >= 3:
            try:
                loaded = yaml.safe_load(parts[1])
            except Exception as e:
                # Unparseable frontmatter: keep the whole file as the body.
                print(f"Error parsing YAML frontmatter: {e}")
            else:
                # Frontmatter that is valid YAML but not a mapping (a bare
                # string or list) carries no usable metadata; ignoring it
                # avoids an AttributeError on the lookups below.
                if isinstance(loaded, dict):
                    metadata = loaded
                body = parts[2].strip()

    # Extract name and description from the frontmatter.
    name = metadata.get("name")
    description = metadata.get("description")

    # If name not in metadata, try to find the first H1 in markdown body
    if not name:
        for line in body.split('\n'):
            if line.startswith('# '):
                name = line[2:].strip()
                break

    return {
        "name": name,
        "description": description,
        "content": body,
        "metadata": metadata
    }
|
||||
|
||||
def _load_data() -> List[Dict[str, Any]]:
    """Load the skill registry from DATA_FILE.

    Returns an empty list when the file is missing or holds invalid JSON.
    """
    if os.path.exists(DATA_FILE):
        try:
            with open(DATA_FILE, "r") as fh:
                return json.load(fh)
        except (json.JSONDecodeError, FileNotFoundError):
            # The file may vanish between the existence check and the open.
            pass
    return []
|
||||
|
||||
def _save_data(data: List[Dict[str, Any]]):
    """Write the skill registry to DATA_FILE, creating its directory if needed.

    Non-ASCII text is written as-is rather than as \\u escapes.
    """
    target_dir = os.path.dirname(DATA_FILE)
    os.makedirs(target_dir, exist_ok=True)
    with open(DATA_FILE, "w") as out:
        json.dump(data, out, indent=2, ensure_ascii=False)
|
||||
|
||||
def _dedupe_skills(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Drop duplicate skill records, keeping the last one of each duplicate set.

    Two records are duplicates when they share the same (stripped) id and the
    same project_id, so different projects may reuse a skill id. Records with
    an empty id are discarded. The result keeps the position of the first
    occurrence of each key.
    """
    latest: Dict[str, Dict[str, Any]] = {}
    for entry in data:
        skill_id = str(entry.get("id") or "").strip()
        if skill_id:
            # A later record overwrites the value but not the dict position.
            latest[f"{skill_id}_{entry.get('project_id')}"] = entry
    return list(latest.values())
|
||||
|
||||
def _safe_skill_dir_name(value: str) -> str:
    """Turn an arbitrary skill name into a lowercase directory-safe token.

    Every character other than ASCII letters, digits, ``_`` and ``-`` becomes
    ``_``. An empty or ``None`` input yields the placeholder ``"skill"``.
    """
    raw = value if value else ""
    normalized = re.sub(r'[^a-zA-Z0-9_\-]', '_', raw).lower()
    if not normalized:
        return "skill"
    return normalized
|
||||
|
||||
# Matches values that cannot be written as a plain (unquoted) YAML scalar.
_FRONTMATTER_NEEDS_QUOTING = re.compile(
    r"""^[\s\-?:,\[\]{}#&*!|>'"%@`]"""  # leading indicator character or whitespace
    r"""|:(?:\s|$)"""                    # looks like a nested "key: value"
    r"""|\s#"""                          # would start a comment
    r"""|[\r\n]"""                       # line breaks end the scalar
    r"""|\s$"""                          # trailing whitespace is not preserved
)


def _frontmatter_scalar(value: Any) -> str:
    """Render ``value`` as a single-line YAML scalar, quoting only when required."""
    text = str(value)
    if _FRONTMATTER_NEEDS_QUOTING.search(text):
        # A JSON string literal is also a valid YAML double-quoted scalar.
        return json.dumps(text, ensure_ascii=False)
    return text


def _write_skill_markdown(skill_dir: str, skill_name: str, description: Optional[str], content: str) -> str:
    """Write ``SKILL.md`` (YAML front matter followed by the body) into ``skill_dir``.

    Args:
        skill_dir: Directory that receives the file; created when missing.
        skill_name: Value for the ``name`` front-matter key.
        description: Value for the ``description`` key; a placeholder is used
            when empty.
        content: Markdown body written after the front matter.

    Returns:
        The path of the written ``SKILL.md`` file.
    """
    os.makedirs(skill_dir, exist_ok=True)
    skill_md_path = os.path.join(skill_dir, "SKILL.md")
    final_description = description or "No description provided"
    body = content or ""
    # Values are quoted when needed so that text containing ": ", "#" or line
    # breaks cannot corrupt the front matter that is parsed back later.
    markdown = (
        f"---\n"
        f"name: {_frontmatter_scalar(skill_name)}\n"
        f"description: {_frontmatter_scalar(final_description)}\n"
        f"---\n\n"
        f"{body}\n"
    )
    with open(skill_md_path, "w", encoding="utf-8") as f:
        f.write(markdown)
    return skill_md_path
|
||||
|
||||
def _scan_builtin_skills(data: List[Dict[str, Any]], registered_paths: set, source_dir: str, source_name: str):
    """Register or refresh every skill found one level below ``source_dir``.

    A sub-directory counts as a skill when it contains ``SKILL.md``. If ``data``
    already holds a builtin record with the same id, or a record pointing at the
    same directory, that record is refreshed in place; otherwise a new builtin
    record is appended. Each processed directory is added to ``registered_paths``.
    Nothing happens when ``source_dir`` does not exist.
    """
    if not os.path.exists(source_dir):
        return

    for entry in os.listdir(source_dir):
        skill_dir = os.path.abspath(os.path.join(source_dir, entry))
        if not os.path.isdir(skill_dir):
            continue
        manifest_path = os.path.join(skill_dir, "SKILL.md")
        if not os.path.exists(manifest_path):
            continue

        parsed = _parse_skill_md(manifest_path)
        fields = {
            "name": parsed.get("name") or entry,
            "description": parsed.get("description") or "No description provided",
            "content": parsed.get("content") or "",
        }

        # First record that is either the builtin with this id or already
        # points at this directory.
        known = next(
            (
                record
                for record in data
                if (record.get("id") == entry and record.get("is_builtin"))
                or record.get("file_path") == skill_dir
            ),
            None,
        )

        if known:
            known.update(fields)
            known["file_path"] = skill_dir
            known["is_builtin"] = True
            known["source"] = source_name
            known["status"] = STATUS_SAFE
        else:
            data.append({
                "id": entry,
                **fields,
                "type": "agentskill",
                "project_id": None,
                "source": source_name,
                "installation_time": datetime.now().strftime("%Y年%m月%d日"),
                "status": STATUS_SAFE,
                "file_path": skill_dir,
                "is_builtin": True
            })
        registered_paths.add(skill_dir)
|
||||
|
||||
def load_skills(project_id: Optional[int] = None) -> List[Dict[str, Any]]:
    """Build the current skill list from the registry and the skill directories.

    Steps:
      1. Load the stored records, normalise ``source``/``status``/``is_builtin``
         and refresh name, description and content from each record's
         ``SKILL.md`` when that file exists.
      2. Register or refresh builtin skills from both builtin directories.
      3. Auto-register unregistered skill directories found one level below
         ``SKILL_HUB_DIR``, deducing the project from a ``p<id>_`` prefix.
      4. De-duplicate by ``(id, project_id)``.

    Args:
        project_id: When given, only skills of that project plus global skills
            (``project_id`` is ``None``) are returned.

    Returns:
        The de-duplicated list of skill records. Nothing is written to disk.
    """
    _ensure_skill_hub_dir()
    data = _load_data()

    registered_paths = set()

    # Sync registered skills with their SKILL.md if available
    for item in data:
        item["source"] = _normalize_source(item.get("source"))
        item["status"] = _normalize_status(item.get("status"))
        if item.get("id") in ("nl2sql", "visualization") or item.get("is_builtin"):
            item["is_builtin"] = True
        else:
            item.setdefault("is_builtin", False)

        if not item.get("file_path"):
            continue
        abs_path = os.path.abspath(item["file_path"])
        registered_paths.add(abs_path)
        skill_md_path = os.path.join(abs_path, "SKILL.md")
        if os.path.exists(skill_md_path):
            metadata_res = _parse_skill_md(skill_md_path)
            # Only non-empty values from the file override the stored ones.
            for field in ("name", "description", "content"):
                if metadata_res.get(field):
                    item[field] = metadata_res[field]

    # Scan builtin skills
    _scan_builtin_skills(data, registered_paths, NANOBOT_BUILTIN_SKILLS_DIR, SOURCE_SYSTEM_BUILTIN)
    _scan_builtin_skills(data, registered_paths, BACKEND_BUILTIN_SKILLS_DIR, SOURCE_SYSTEM_BUILTIN)

    # Upload staging directories are created inside SKILL_HUB_DIR with the name
    # "temp_<timestamp>_<8 hex chars>"; they belong to an in-flight upload and
    # must never be registered as skills.
    staging_dir_pattern = re.compile(r'^temp_\d+(?:\.\d+)?_[0-9a-f]{8}$')

    # Scan for unregistered skills in SKILL_HUB_DIR (1-level deep to match nanobot's behavior)
    if os.path.exists(SKILL_HUB_DIR):
        for item in os.listdir(SKILL_HUB_DIR):
            if staging_dir_pattern.match(item):
                continue
            skill_dir = os.path.abspath(os.path.join(SKILL_HUB_DIR, item))
            if not os.path.isdir(skill_dir) or skill_dir in registered_paths:
                continue
            skill_md_path = os.path.join(skill_dir, "SKILL.md")
            if not os.path.exists(skill_md_path):
                continue

            metadata_res = _parse_skill_md(skill_md_path)
            skill_name = metadata_res.get("name") or item

            # Try to deduce project_id from directory prefix (e.g., p123_skillname)
            deduced_project_id = None
            match = re.match(r'^p(\d+)_', item)
            if match:
                deduced_project_id = int(match.group(1))

            data.append({
                "id": item,
                "name": skill_name,
                "description": metadata_res.get("description") or "No description provided",
                "content": metadata_res.get("content") or "",
                "type": "agentskill",
                "project_id": deduced_project_id,
                "source": SOURCE_BACKEND_GENERATED,
                "installation_time": datetime.now().strftime("%Y年%m月%d日"),
                "status": STATUS_SAFE,
                "file_path": skill_dir,
                "is_builtin": item in ("nl2sql", "visualization")
            })
            registered_paths.add(skill_dir)

    deduped = _dedupe_skills(data)
    if project_id is not None:
        return [item for item in deduped if item.get("project_id") == project_id or item.get("project_id") is None]
    return deduped
|
||||
|
||||
@router.get("/skills", response_model=List[Skill])
def list_skills(project_id: Optional[int] = None):
    """Return every skill visible to ``project_id`` (all skills when omitted)."""
    skills = []
    for record in load_skills(project_id):
        skills.append(Skill(**record))
    return skills
|
||||
|
||||
@router.get("/skills/{skill_id}", response_model=Skill)
def get_skill(skill_id: str, project_id: Optional[int] = None):
    """Return a single skill by id.

    Args:
        skill_id: Identifier of the skill.
        project_id: Optional project scope. A skill owned by that project is
            preferred; a global skill (``project_id`` is ``None``) with the same
            id is used as a fallback, matching what the list endpoint exposes
            for the same project.

    Raises:
        HTTPException: 404 when no matching skill exists.
    """
    global_match = None
    for item in load_skills():
        if item["id"] != skill_id:
            continue
        owner = item.get("project_id")
        if project_id is None or owner == project_id:
            return Skill(**item)
        if owner is None and global_match is None:
            # Keep looking in case a project-specific record follows.
            global_match = item
    if global_match is not None:
        return Skill(**global_match)
    raise HTTPException(status_code=404, detail="Skill not found")
|
||||
|
||||
def _find_skill_md_dir(search_root: str) -> Optional[str]:
    """Return the first directory under ``search_root`` that contains ``SKILL.md``."""
    for root, _dirs, files in os.walk(search_root):
        if "SKILL.md" in files:
            return root
    return None


def _extract_tarball_safely(archive_path: str, dest_dir: str) -> None:
    """Extract a gzip tarball into ``dest_dir``, rejecting entries that escape it."""
    with tarfile.open(archive_path, 'r:gz') as tar_ref:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters refuse absolute paths,
            # "..", links outside the destination and special files.
            tar_ref.extractall(dest_dir, filter="data")
            return
        dest_root = os.path.realpath(dest_dir)
        for member in tar_ref.getmembers():
            if member.issym() or member.islnk() or member.isdev():
                raise ValueError(f"Unsupported archive entry: {member.name}")
            target = os.path.realpath(os.path.join(dest_root, member.name))
            if os.path.commonpath([dest_root, target]) != dest_root:
                raise ValueError(f"Archive entry escapes the target directory: {member.name}")
        tar_ref.extractall(dest_dir)


@router.post("/skills/upload")
async def upload_skill(
    file: UploadFile = File(...),
    project_id: Optional[int] = Form(None)
):
    """Upload a skill file (SKILL.md) or a packaged skill (zip/tar.gz).

    The upload is staged in a temporary directory inside ``SKILL_HUB_DIR``,
    unpacked if needed, copied into its final skill directory and then
    registered in the skill registry.

    Args:
        file: The uploaded ``SKILL.md``, ``.zip``, ``.tar.gz`` or ``.tgz`` file.
        project_id: Optional owning project; the skill id and directory are
            prefixed with ``p<project_id>_`` when given.

    Returns:
        The registered skill record.

    Raises:
        HTTPException: 400 for a missing filename, unsupported file type,
            broken archive or an upload without ``SKILL.md``; 500 for
            unexpected errors.
    """
    # The filename is client-controlled: keep only its last path component so
    # it cannot point outside the staging directory.
    filename = os.path.basename((file.filename or "").replace("\\", "/"))
    if filename in ("", ".", ".."):
        raise HTTPException(status_code=400, detail="Uploaded file must have a filename")
    print(f"Uploading skill: {filename}, project_id: {project_id}")
    _ensure_skill_hub_dir()

    # Create a unique temp directory
    temp_dir_name = f"temp_{datetime.now().timestamp()}_{os.urandom(4).hex()}"
    temp_dir = os.path.join(SKILL_HUB_DIR, temp_dir_name)
    os.makedirs(temp_dir, exist_ok=True)

    try:
        file_path = os.path.join(temp_dir, filename)
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        skill_source_dir = None

        # Handle different file types
        if filename.endswith(".zip"):
            try:
                # zipfile strips absolute prefixes and ".." components from
                # member names while extracting.
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)
                os.remove(file_path)
                skill_source_dir = _find_skill_md_dir(temp_dir)
            except Exception as e:
                print(f"Zip extraction failed: {e}")
                raise HTTPException(status_code=400, detail=f"Failed to extract zip: {str(e)}")

        elif filename.endswith((".tar.gz", ".tgz")):
            try:
                _extract_tarball_safely(file_path, temp_dir)
                os.remove(file_path)
                skill_source_dir = _find_skill_md_dir(temp_dir)
            except Exception as e:
                print(f"Tarball extraction failed: {e}")
                raise HTTPException(status_code=400, detail=f"Failed to extract tarball: {str(e)}")

        elif filename == "SKILL.md":
            skill_source_dir = temp_dir
        else:
            print(f"Unsupported file type: {filename}")
            raise HTTPException(status_code=400, detail="Only SKILL.md or packaged skills (zip/tar.gz) are supported")

        if not skill_source_dir or not os.path.exists(os.path.join(skill_source_dir, "SKILL.md")):
            print(f"SKILL.md not found in {filename}")
            raise HTTPException(status_code=400, detail="SKILL.md not found in the uploaded file")

        # Parse metadata
        skill_md_path = os.path.join(skill_source_dir, "SKILL.md")
        metadata_res = _parse_skill_md(skill_md_path)

        # Use metadata name, or fallback to folder name or filename
        skill_name = metadata_res.get("name")
        if not skill_name:
            if filename == "SKILL.md":
                skill_name = "unnamed_skill"
            else:
                # Use filename without extension
                skill_name = os.path.splitext(filename)[0]

        # Create a safe directory name for the skill
        safe_name = _safe_skill_dir_name(skill_name)
        final_skill_id = f"{safe_name}_{datetime.now().strftime('%Y%m%d%H%M%S')}"

        if project_id is not None:
            # Prefix the folder name with p{project_id}_ to distinguish projects in storage
            # without breaking nanobot's 1-level-deep skill loader
            final_skill_id = f"p{project_id}_{final_skill_id}"
        final_skill_dir = os.path.join(SKILL_HUB_DIR, final_skill_id)

        print(f"Finalizing skill: {skill_name} -> {final_skill_dir}")

        # Copy the skill content to its final destination
        os.makedirs(final_skill_dir, exist_ok=True)
        for item in os.listdir(skill_source_dir):
            s = os.path.join(skill_source_dir, item)
            d = os.path.join(final_skill_dir, item)
            if os.path.isdir(s):
                shutil.copytree(s, d, dirs_exist_ok=True)
            else:
                shutil.copy2(s, d)

        # Remove the staging directory BEFORE reading the registry: it lives
        # inside SKILL_HUB_DIR, so load_skills() would otherwise pick it up as
        # an unregistered skill and the entry would be saved below.
        shutil.rmtree(temp_dir, ignore_errors=True)

        # Register in skills.json
        final_abs = os.path.abspath(final_skill_dir)
        # load_skills() has already auto-discovered the new directory; drop
        # that generated record so the skill is stored once, as an upload.
        data = [
            item for item in load_skills()
            if not item.get("file_path") or os.path.abspath(item["file_path"]) != final_abs
        ]
        new_skill = {
            "id": final_skill_id,
            "name": skill_name,
            "description": metadata_res.get("description") or "No description provided",
            "content": metadata_res.get("content") or "",
            "type": "agentskill",
            "project_id": project_id,
            "source": SOURCE_UPLOADED_FILE,
            "installation_time": datetime.now().strftime("%Y年%m月%d日"),
            "status": STATUS_SAFE,
            "file_path": final_skill_dir
        }

        data.append(new_skill)
        _save_data(data)
        print(f"Skill registered successfully: {final_skill_id}")

        return new_skill

    except HTTPException:
        raise
    except Exception as e:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An unexpected error occurred: {str(e)}")
    finally:
        # Cleanup temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
|
||||
|
||||
@router.post("/skills", response_model=Skill)
def create_skill(skill: SkillCreate):
    """Create a skill record and, when no ``file_path`` is supplied, its SKILL.md.

    Without a ``file_path`` the skill gets a directory inside ``SKILL_HUB_DIR``
    named after the sanitised id (prefixed with ``p<project_id>_`` for project
    skills), and that directory name becomes the stored id.

    Raises:
        HTTPException: 400 when a skill with the requested or resulting id
            already exists in the same project.
    """
    _ensure_skill_hub_dir()
    data = load_skills()

    new_skill_dict = skill.dict()
    new_skill_dict["source"] = _normalize_source(new_skill_dict.get("source"))
    new_skill_dict["status"] = _normalize_status(new_skill_dict.get("status"))
    if not new_skill_dict.get("installation_time"):
        new_skill_dict["installation_time"] = datetime.now().strftime("%Y年%m月%d日")

    needs_scaffold = not new_skill_dict.get("file_path")
    if needs_scaffold:
        project_id = new_skill_dict.get("project_id")
        base_dir_name = _safe_skill_dir_name(new_skill_dict["id"])
        # Add prefix for project storage distinction
        if project_id is not None and not base_dir_name.startswith(f"p{project_id}_"):
            base_dir_name = f"p{project_id}_{base_dir_name}"
        new_skill_dict["id"] = base_dir_name

    # Compare against the id that will actually be stored as well as the raw
    # one; otherwise a second request with the same raw id passes the check and
    # silently overwrites the first skill's directory.
    candidate_ids = (skill.id, new_skill_dict["id"])
    if any(
        item.get("id") in candidate_ids and item.get("project_id") == skill.project_id
        for item in data
    ):
        raise HTTPException(status_code=400, detail="Skill with this ID already exists in this project")

    if needs_scaffold:
        skill_dir = os.path.join(SKILL_HUB_DIR, new_skill_dict["id"])
        _write_skill_markdown(
            skill_dir=skill_dir,
            skill_name=new_skill_dict["name"],
            description=new_skill_dict.get("description"),
            content=new_skill_dict.get("content", ""),
        )
        new_skill_dict["file_path"] = skill_dir

    data.append(new_skill_dict)
    _save_data(data)
    return Skill(**new_skill_dict)
|
||||
|
||||
@router.put("/skills/{skill_id}", response_model=Skill)
def update_skill(skill_id: str, skill: SkillUpdate, project_id: Optional[int] = None):
    """Apply the fields set in ``skill`` to the first matching skill record.

    A record matches when its id equals ``skill_id`` and, if ``project_id`` is
    given, its ``project_id`` equals it. When the record has a ``file_path`` its
    ``SKILL.md`` is rewritten from the merged values before the registry is saved.

    Raises:
        HTTPException: 404 when no record matches.
    """
    records = load_skills()
    for position, current in enumerate(records):
        if current["id"] != skill_id:
            continue
        if project_id is not None and current.get("project_id") != project_id:
            continue

        changes = skill.dict(exclude_unset=True)
        for field, normalizer in (("source", _normalize_source), ("status", _normalize_status)):
            if field in changes:
                changes[field] = normalizer(changes.get(field))

        merged = {**current, **changes}
        if merged.get("file_path"):
            _write_skill_markdown(
                skill_dir=merged["file_path"],
                skill_name=merged.get("name") or current.get("name") or "skill",
                description=merged.get("description"),
                content=merged.get("content", ""),
            )

        records[position] = merged
        _save_data(records)
        return Skill(**merged)

    raise HTTPException(status_code=404, detail="Skill not found")
|
||||
|
||||
@router.delete("/skills/{skill_id}")
def delete_skill(skill_id: str, project_id: Optional[int] = None):
    """Delete the skill(s) with ``skill_id`` from the registry and from disk.

    Args:
        skill_id: Identifier of the skill to delete.
        project_id: When given, only records of that project or global records
            (``project_id`` is ``None``) are deleted.

    Raises:
        HTTPException: 400 when a record with this id is builtin, 404 when
            nothing matches.
    """
    remaining = []
    removed = []
    for item in load_skills():
        if item["id"] != skill_id:
            remaining.append(item)
            continue
        if item.get("is_builtin"):
            raise HTTPException(status_code=400, detail="Builtin skills cannot be deleted")
        # If project_id is provided, we only delete if it matches
        if project_id is not None and item.get("project_id") not in (project_id, None):
            remaining.append(item)
            continue
        removed.append(item)

    if not removed:
        raise HTTPException(status_code=404, detail="Skill not found")

    hub_root = os.path.realpath(SKILL_HUB_DIR)
    # Clean up the files of every removed record, not just the last one, so no
    # orphaned directory is rediscovered on the next scan.
    for item in removed:
        file_path = item.get("file_path")
        if not file_path or not os.path.exists(file_path):
            continue
        target = os.path.realpath(file_path)
        try:
            inside_hub = target != hub_root and os.path.commonpath([hub_root, target]) == hub_root
        except ValueError:
            # e.g. paths on different drives
            inside_hub = False
        if not inside_hub:
            # file_path can be supplied by API clients; never delete anything
            # that is not located inside the skill hub.
            print(f"Skipping file cleanup outside the skill hub: {file_path}")
            continue
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
            else:
                os.remove(file_path)
        except Exception as e:
            print(f"Error deleting skill files at {file_path}: {e}")

    _save_data(remaining)
    return {"message": "Skill deleted successfully"}
|
||||
@@ -0,0 +1,106 @@
|
||||
from typing import List
|
||||
from fastapi import APIRouter, HTTPException, Depends
|
||||
from sqlalchemy.orm import Session
|
||||
from app.database import get_db
|
||||
from app.models.subagent import Subagent
|
||||
from app.models.project import Project
|
||||
from app.schemas.subagent import SubagentCreate, SubagentUpdate, Subagent as SubagentSchema
|
||||
from app.core.security import get_current_user, CurrentUser
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/projects/{project_id}/subagents", response_model=List[SubagentSchema])
def list_subagents(
    project_id: int,
    skip: int = 0,
    limit: int = 100,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """List a project's subagents, paginated by ``skip`` and ``limit``.

    Raises:
        HTTPException: 404 when the project does not exist, 403 when the caller
            is neither an admin nor the project owner.
    """
    project = db.query(Project).filter(Project.id == project_id).first()
    if not project:
        raise HTTPException(status_code=404, detail="Project not found")

    if not (current_user.is_admin or project.owner_id == current_user.id):
        raise HTTPException(status_code=403, detail="Not enough permissions")

    query = db.query(Subagent).filter(Subagent.project_id == project_id)
    page = query.offset(skip).limit(limit)
    return page.all()
|
||||
|
||||
@router.post("/projects/{project_id}/subagents", response_model=SubagentSchema)
def create_subagent(
    project_id: int,
    subagent: SubagentCreate,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """Create a subagent inside the given project and return the stored row.

    Raises:
        HTTPException: 404 when the project does not exist, 403 when the caller
            is neither an admin nor the project owner.
    """
    project = db.query(Project).filter(Project.id == project_id).first()
    if not project:
        raise HTTPException(status_code=404, detail="Project not found")

    if not (current_user.is_admin or project.owner_id == current_user.id):
        raise HTTPException(status_code=403, detail="Not enough permissions")

    payload = subagent.dict()
    created = Subagent(**payload, project_id=project_id)
    db.add(created)
    db.commit()
    # Reload so generated columns are populated in the response.
    db.refresh(created)
    return created
|
||||
|
||||
@router.get("/subagents/{subagent_id}", response_model=SubagentSchema)
def read_subagent(
    subagent_id: int,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """Return one subagent by id.

    Raises:
        HTTPException: 404 when the subagent does not exist, 403 when the
            caller is neither an admin nor the owner of its project.
    """
    db_subagent = db.query(Subagent).filter(Subagent.id == subagent_id).first()
    if db_subagent is None:
        raise HTTPException(status_code=404, detail="Subagent not found")

    project = db.query(Project).filter(Project.id == db_subagent.project_id).first()
    # A subagent whose project row is gone has no owner: deny non-admins
    # instead of failing on ``project.owner_id``.
    is_owner = project is not None and project.owner_id == current_user.id
    if not current_user.is_admin and not is_owner:
        raise HTTPException(status_code=403, detail="Not enough permissions")

    return db_subagent
|
||||
|
||||
@router.put("/subagents/{subagent_id}", response_model=SubagentSchema)
def update_subagent(
    subagent_id: int,
    subagent: SubagentUpdate,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """Apply the fields set in ``subagent`` to an existing subagent.

    Raises:
        HTTPException: 404 when the subagent does not exist, 403 when the
            caller is neither an admin nor the owner of its project.
    """
    db_subagent = db.query(Subagent).filter(Subagent.id == subagent_id).first()
    if db_subagent is None:
        raise HTTPException(status_code=404, detail="Subagent not found")

    project = db.query(Project).filter(Project.id == db_subagent.project_id).first()
    # A subagent whose project row is gone has no owner: deny non-admins
    # instead of failing on ``project.owner_id``.
    is_owner = project is not None and project.owner_id == current_user.id
    if not current_user.is_admin and not is_owner:
        raise HTTPException(status_code=403, detail="Not enough permissions")

    # Only fields explicitly present in the request are changed.
    subagent_data = subagent.dict(exclude_unset=True)
    for key, value in subagent_data.items():
        setattr(db_subagent, key, value)

    db.add(db_subagent)
    db.commit()
    db.refresh(db_subagent)
    return db_subagent
|
||||
|
||||
@router.delete("/subagents/{subagent_id}")
def delete_subagent(
    subagent_id: int,
    db: Session = Depends(get_db),
    current_user: CurrentUser = Depends(get_current_user)
):
    """Delete a subagent by id.

    Raises:
        HTTPException: 404 when the subagent does not exist, 403 when the
            caller is neither an admin nor the owner of its project.
    """
    db_subagent = db.query(Subagent).filter(Subagent.id == subagent_id).first()
    if db_subagent is None:
        raise HTTPException(status_code=404, detail="Subagent not found")

    project = db.query(Project).filter(Project.id == db_subagent.project_id).first()
    # A subagent whose project row is gone has no owner: deny non-admins
    # instead of failing on ``project.owner_id``.
    is_owner = project is not None and project.owner_id == current_user.id
    if not current_user.is_admin and not is_owner:
        raise HTTPException(status_code=403, detail="Not enough permissions")

    db.delete(db_subagent)
    db.commit()
    return {"status": "success"}
|
||||
@@ -0,0 +1,73 @@
|
||||
from fastapi import APIRouter, UploadFile, File, HTTPException
|
||||
import pandas as pd
|
||||
import duckdb
|
||||
import io
|
||||
import uuid
|
||||
|
||||
from app.core.data_root import get_uploads_root
|
||||
|
||||
router = APIRouter()
|
||||
upload_dir = get_uploads_root()
|
||||
|
||||
@router.post("/upload/file")
async def upload_file(file: UploadFile = File(...)):
    """Store an uploaded data file and return a short description of it.

    CSV, Excel and Parquet files are loaded into a DataFrame and described via
    DuckDB; SQLite files are only stored. The file stays on disk even when the
    analysis fails, in which case the response carries ``analysis_error``.

    Raises:
        HTTPException: 400 for an unsupported extension or an empty file,
            500 for unexpected storage errors.
    """
    upload_dir.mkdir(parents=True, exist_ok=True)
    allowed_extensions = ('.csv', '.xls', '.xlsx', '.parquet', '.db', '.sqlite', '.sqlite3')
    # The filename is client-controlled and may be missing or contain directory
    # components; keep only its last path component.
    original_name = (file.filename or "").replace("\\", "/").rsplit("/", 1)[-1]
    filename_lower = original_name.lower()
    if not filename_lower.endswith(allowed_extensions):
        raise HTTPException(status_code=400, detail="Invalid file type. Allowed: CSV, Excel, Parquet, SQLite.")

    try:
        content = await file.read()
        if not content:
            raise HTTPException(status_code=400, detail="Empty file is not allowed.")

        unique_filename = f"{uuid.uuid4()}-{original_name}"
        save_path = upload_dir / unique_filename
        save_path.write_bytes(content)
        file_url = f"local://{unique_filename}"

        try:
            if filename_lower.endswith(('.db', '.sqlite', '.sqlite3')):
                # For SQLite, we don't load into DF immediately for analysis here
                # Just return success
                return {
                    "filename": unique_filename,
                    "url": file_url,
                    "rows": 0,
                    "columns": [],
                    "summary": "SQLite database uploaded"
                }

            file_obj = io.BytesIO(content)
            if filename_lower.endswith('.csv'):
                df = pd.read_csv(file_obj)
            elif filename_lower.endswith(('.xls', '.xlsx')):
                df = pd.read_excel(file_obj)
            else:
                # Only '.parquet' is left after the extension check above.
                df = pd.read_parquet(file_obj)

            # For DF supported types
            duckdb_conn = duckdb.connect(database=':memory:')
            try:
                duckdb_conn.register('uploaded_file', df)
                summary = duckdb_conn.execute("DESCRIBE uploaded_file").fetchall()
            finally:
                # Release the in-memory database on success and on failure.
                duckdb_conn.close()

            return {
                "filename": unique_filename,
                "url": file_url,
                "rows": len(df),
                "columns": list(df.columns),
                "summary": str(summary)
            }
        except Exception as e:
            # Analysis is best effort: the upload itself already succeeded.
            return {
                "filename": unique_filename,
                "url": file_url,
                "analysis_error": str(e)
            }

    except HTTPException:
        # Keep deliberate HTTP errors (e.g. the 400 above) from being turned
        # into a 500 by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,215 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException, status, BackgroundTasks
|
||||
from fastapi.security import OAuth2PasswordRequestForm
|
||||
from sqlalchemy.orm import Session
|
||||
from typing import List
|
||||
import secrets
|
||||
import hashlib
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from app.database import get_db
|
||||
from app.models.user import User, EmailVerification
|
||||
from app.schemas.user import UserCreate, UserUpdate, UserResponse, ResendVerificationRequest
|
||||
from app.core.security import get_password_hash, verify_password, create_access_token, ACCESS_TOKEN_EXPIRE_MINUTES
|
||||
from app.core.email import send_verification_email
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
def generate_verification_token() -> str:
    """Create a random URL-safe token from 32 bytes of secure randomness."""
    entropy_bytes = 32
    token = secrets.token_urlsafe(entropy_bytes)
    return token
|
||||
|
||||
def hash_token(token: str) -> str:
    """Return the hexadecimal SHA-256 digest of ``token`` (encoded as UTF-8)."""
    digest = hashlib.sha256()
    digest.update(token.encode())
    return digest.hexdigest()
|
||||
|
||||
@router.post("/auth/login")
def login(form_data: OAuth2PasswordRequestForm = Depends(), db: Session = Depends(get_db)):
    """Authenticate with username and password and issue a bearer token.

    Raises:
        HTTPException: 401 for unknown users or a wrong password, 400 when the
            account is not active.
    """
    account = db.query(User).filter(User.username == form_data.username).first()
    if not (account and verify_password(form_data.password, account.hashed_password)):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Incorrect username or password",
            headers={"WWW-Authenticate": "Bearer"},
        )

    if not account.is_active:
        raise HTTPException(status_code=400, detail="Inactive user")

    claims = {"sub": account.username, "is_admin": account.is_admin, "id": account.id}
    lifetime = timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
    access_token = create_access_token(data=claims, expires_delta=lifetime)

    profile = {
        "id": account.id,
        "username": account.username,
        "email": account.email,
        "avatar": account.avatar,
        "is_admin": account.is_admin
    }
    return {
        "access_token": access_token,
        "token_type": "bearer",
        "user": profile
    }
|
||||
|
||||
@router.post("/auth/register", response_model=UserResponse)
def register_user(user: UserCreate, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
    """Self-service registration.

    The very first account becomes an active administrator. Every later account
    is created inactive and without admin rights, and receives a verification
    e-mail whose token is valid for 24 hours.

    Raises:
        HTTPException: 400 when the username or e-mail is already registered.
    """
    db_user = db.query(User).filter(User.username == user.username).first()
    if db_user:
        raise HTTPException(status_code=400, detail="Username already registered")

    db_user_email = db.query(User).filter(User.email == user.email).first()
    if db_user_email:
        raise HTTPException(status_code=400, detail="Email already registered")

    hashed_password = get_password_hash(user.password)

    # If this is the first user, make them an admin
    is_first_user = db.query(User).count() == 0
    # This endpoint is open to anyone, so the ``is_admin`` flag in the request
    # body must not be honoured: otherwise any visitor could register as admin.
    is_admin = is_first_user
    is_active = is_first_user

    db_user = User(
        username=user.username,
        email=user.email,
        avatar=user.avatar,
        hashed_password=hashed_password,
        is_active=is_active,
        is_admin=is_admin
    )
    db.add(db_user)
    db.commit()
    db.refresh(db_user)

    if not is_active:
        token = generate_verification_token()
        # Only the hash is stored; the plain token goes out by e-mail.
        hashed = hash_token(token)
        expires_at = datetime.now(timezone.utc) + timedelta(hours=24)
        verification = EmailVerification(
            user_id=db_user.id,
            token_hash=hashed,
            expires_at=expires_at
        )
        db.add(verification)
        db.commit()

        # Copy the e-mail into a local variable so the background task does not
        # trigger a lazy load after the session has been closed.
        user_email = db_user.email
        background_tasks.add_task(send_verification_email, user_email, token)

    return db_user
|
||||
|
||||
@router.get("/auth/verify-email")
def verify_email(token: str, db: Session = Depends(get_db)):
    """Activate the account that belongs to an unused, unexpired e-mail token.

    Raises:
        HTTPException: 400 when the token is unknown, already used or expired;
            404 when the referenced user no longer exists.
    """
    token_digest = hash_token(token)
    pending = (
        db.query(EmailVerification)
        .filter(
            EmailVerification.token_hash == token_digest,
            EmailVerification.is_used == False,  # noqa: E712 - builds a SQL expression
        )
        .first()
    )
    if not pending:
        raise HTTPException(status_code=400, detail="Invalid or used token")

    # Stored timestamps may come back naive; treat those as UTC.
    deadline = pending.expires_at
    if deadline.tzinfo is None:
        deadline = deadline.replace(tzinfo=timezone.utc)

    if deadline < datetime.now(timezone.utc):
        raise HTTPException(status_code=400, detail="Token expired")

    account = db.query(User).filter(User.id == pending.user_id).first()
    if not account:
        raise HTTPException(status_code=404, detail="User not found")

    account.is_active = True
    pending.is_used = True
    db.commit()

    return {"status": "success", "message": "Email verified successfully"}
|
||||
|
||||
@router.post("/auth/resend-verification")
def resend_verification(request: ResendVerificationRequest, background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
    """Issue a fresh 24-hour verification token for a not-yet-active user.

    Raises:
        HTTPException: 404 when the username is unknown, 400 when the user is
            already active.
    """
    account = db.query(User).filter(User.username == request.username).first()
    if not account:
        raise HTTPException(status_code=404, detail="User not found")
    if account.is_active:
        raise HTTPException(status_code=400, detail="User already active")

    token = generate_verification_token()
    record = EmailVerification(
        user_id=account.id,
        token_hash=hash_token(token),
        expires_at=datetime.now(timezone.utc) + timedelta(hours=24)
    )
    db.add(record)
    db.commit()

    # Read the e-mail now so the background task never touches a closed session.
    recipient = account.email
    background_tasks.add_task(send_verification_email, recipient, token)
    return {"status": "success", "message": "Verification email sent"}
|
||||
|
||||
@router.get("/users", response_model=List[UserResponse])
def read_users(skip: int = 0, limit: int = 100, db: Session = Depends(get_db)):
    """Return one page of users, selected by ``skip`` and ``limit``."""
    # NOTE(review): no authentication dependency is declared on this route - confirm that is intended.
    page = db.query(User).offset(skip).limit(limit)
    return page.all()
|
||||
|
||||
@router.get("/users/{user_id}", response_model=UserResponse)
def read_user(user_id: int, db: Session = Depends(get_db)):
    """Return the user with ``user_id``.

    Raises:
        HTTPException: 404 when no such user exists.
    """
    found = db.query(User).filter(User.id == user_id).first()
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail="User not found")
|
||||
|
||||
@router.post("/users", response_model=UserResponse)
def create_user(user: UserCreate, db: Session = Depends(get_db)):
    """Create a user exactly as described by the payload, including its flags.

    Raises:
        HTTPException: 400 when the username or e-mail is already registered.
    """
    # NOTE(review): no authentication dependency is declared on this route, yet it accepts is_admin - confirm.
    username_taken = db.query(User).filter(User.username == user.username).first()
    if username_taken:
        raise HTTPException(status_code=400, detail="Username already registered")

    email_taken = db.query(User).filter(User.email == user.email).first()
    if email_taken:
        raise HTTPException(status_code=400, detail="Email already registered")

    attributes = {
        "username": user.username,
        "email": user.email,
        "avatar": user.avatar,
        "hashed_password": get_password_hash(user.password),
        "is_active": user.is_active,
        "is_admin": user.is_admin,
    }
    created = User(**attributes)
    db.add(created)
    db.commit()
    db.refresh(created)
    return created
|
||||
|
||||
@router.put("/users/{user_id}", response_model=UserResponse)
def update_user(user_id: int, user: UserUpdate, db: Session = Depends(get_db)):
    """Apply the fields set in ``user`` to an existing account.

    A non-empty ``password`` is stored as a hash; an empty one is ignored.

    Raises:
        HTTPException: 404 when no such user exists.
    """
    target = db.query(User).filter(User.id == user_id).first()
    if not target:
        raise HTTPException(status_code=404, detail="User not found")

    changes = user.model_dump(exclude_unset=True)
    # The plain password is never copied onto the model.
    new_password = changes.pop("password", None)
    if new_password:
        target.hashed_password = get_password_hash(new_password)
    for field, value in changes.items():
        setattr(target, field, value)

    db.commit()
    db.refresh(target)
    return target
|
||||
|
||||
@router.delete("/users/{user_id}")
def delete_user(user_id: int, db: Session = Depends(get_db)):
    """Delete the user with ``user_id``.

    Raises:
        HTTPException: 404 when no such user exists.
    """
    # NOTE(review): no authentication dependency is declared on this route - confirm that is intended.
    target = db.query(User).filter(User.id == user_id).first()
    if target:
        db.delete(target)
        db.commit()
        return {"ok": True}
    raise HTTPException(status_code=404, detail="User not found")
|
||||
@@ -0,0 +1,31 @@
|
||||
from typing import Optional, Dict, Any
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.api.llm import get_current_user, get_admin_user, CurrentUser
|
||||
from app.services.web_search_config_store import get_web_search_config, save_web_search_config
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
class WebSearchConfigModel(BaseModel):
    """Request/response schema for the web-search settings."""

    # Which search backend to use.
    provider: str = Field(
        default="duckduckgo",
        description="Web search provider (brave, tavily, duckduckgo, searxng, jina)",
    )
    # Credential for the selected provider.
    api_key: Optional[str] = Field(
        default="",
        description="API Key for the provider",
    )
    base_url: Optional[str] = Field(
        default="",
        description="Base URL for SearXNG",
    )
    max_results: int = Field(
        default=5,
        description="Maximum number of search results",
    )
|
||||
|
||||
def _sanitize_config(config: Dict[str, Any], is_admin: bool) -> Dict[str, Any]:
    """Return a shallow copy of ``config`` with ``api_key`` blanked for non-admins.

    The input mapping is never modified.
    """
    if is_admin:
        return dict(config)
    return {**config, "api_key": None}
|
||||
|
||||
@router.get("/web-search/config", response_model=WebSearchConfigModel)
def get_config(current_user: CurrentUser = Depends(get_current_user)):
    """Return the stored web-search settings; the API key is hidden from non-admins."""
    stored = get_web_search_config()
    visible = _sanitize_config(stored, current_user.is_admin)
    return WebSearchConfigModel(**visible)
|
||||
|
||||
@router.put("/web-search/config", response_model=WebSearchConfigModel)
def update_config(config: WebSearchConfigModel, _: CurrentUser = Depends(get_admin_user)):
    """Persist new web-search settings (admin only) and echo them back."""
    payload = config.dict()
    save_web_search_config(payload)
    return WebSearchConfigModel(**payload)
|
||||
@@ -0,0 +1,231 @@
|
||||
import json
|
||||
import os
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
# Typer CLI application; the commands defined below register themselves on it.
app = typer.Typer(
|
||||
name="dataclaw",
|
||||
context_settings={"help_option_names": ["-h", "--help"]},
|
||||
help="全源灵动 WebUI 服务控制命令",
|
||||
no_args_is_help=True,
|
||||
)
|
||||
# Shared Rich console used for the messages printed by the commands below.
console = Console()
|
||||
|
||||
|
||||
def _default_pid_file() -> Path:
    """Return the default path of the JSON state file under ``<data root>/run``."""
    run_dir = get_data_root() / "run"
    return run_dir / "dataclaw-webui.json"
|
||||
|
||||
|
||||
def _default_log_file() -> Path:
    """Return the default path of the service log under ``<data root>/run``."""
    run_dir = get_data_root() / "run"
    return run_dir / "dataclaw-webui.log"
|
||||
|
||||
|
||||
def _resolve_path(value: str | None, fallback: Path) -> Path:
|
||||
if value:
|
||||
return Path(value).expanduser().resolve()
|
||||
return fallback
|
||||
|
||||
|
||||
def _ensure_parent(path: Path) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def _read_state(pid_file: Path) -> dict[str, Any] | None:
|
||||
if not pid_file.exists():
|
||||
return None
|
||||
try:
|
||||
return json.loads(pid_file.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def _write_state(pid_file: Path, state: dict[str, Any]) -> None:
    """Serialise *state* as indented UTF-8 JSON into *pid_file*, creating its directory first."""
    _ensure_parent(pid_file)
    payload = json.dumps(state, ensure_ascii=False, indent=2)
    pid_file.write_text(payload, encoding="utf-8")
|
||||
|
||||
|
||||
def _remove_state(pid_file: Path) -> None:
|
||||
try:
|
||||
pid_file.unlink()
|
||||
except FileNotFoundError:
|
||||
return
|
||||
|
||||
|
||||
def _is_process_running(pid: int) -> bool:
|
||||
if pid <= 0:
|
||||
return False
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
except OSError:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _wait_for_server_ready(host: str, port: int, timeout: float) -> bool:
|
||||
deadline = time.time() + timeout
|
||||
while time.time() < deadline:
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
time.sleep(0.2)
|
||||
return False
|
||||
|
||||
|
||||
def _build_uvicorn_command(host: str, port: int, reload: bool, log_level: str, app_target: str) -> list[str]:
|
||||
command = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"uvicorn",
|
||||
app_target,
|
||||
"--host",
|
||||
host,
|
||||
"--port",
|
||||
str(port),
|
||||
"--log-level",
|
||||
log_level,
|
||||
]
|
||||
if reload:
|
||||
command.append("--reload")
|
||||
return command
|
||||
|
||||
|
||||
def _stop_pid(pid: int, timeout: float) -> bool:
    """Terminate process *pid*, escalating from SIGTERM to SIGKILL.

    Sends SIGTERM and waits up to *timeout* seconds for the process to
    disappear; if it is still alive, sends SIGKILL and waits a short grace
    period. Returns True when the process is gone, False when it could not
    be signalled or is still alive afterwards.
    """
    try:
        os.kill(pid, signal.SIGTERM)
    except PermissionError:
        # The process exists but we may not signal it: it was not stopped.
        return False
    except OSError:
        return True

    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if not _is_process_running(pid):
            return True
        time.sleep(0.2)

    try:
        os.kill(pid, signal.SIGKILL)
    except PermissionError:
        return False
    except OSError:
        return True

    # SIGKILL is delivered asynchronously, so give the kernel a moment to
    # tear the process down before declaring the stop a failure.
    kill_deadline = time.monotonic() + 2.0
    while time.monotonic() < kill_deadline:
        if not _is_process_running(pid):
            return True
        time.sleep(0.05)
    return not _is_process_running(pid)
|
||||
|
||||
|
||||
def _port_accepts_connections(host: str, port: int) -> bool:
    """Return True when something already accepts TCP connections on ``host:port``."""
    try:
        with socket.create_connection((host, port), timeout=0.5):
            return True
    except OSError:
        return False


@app.command()
def start(
    host: str = typer.Option("127.0.0.1", "--host", help="服务监听地址"),
    port: int = typer.Option(8000, "--port", "-p", help="服务端口"),
    reload: bool = typer.Option(False, "--reload", "-r", help="开启自动重载(开发模式)"),
    log_level: str = typer.Option("info", "--log-level", help="日志级别"),
    app_target: str = typer.Option("main:app", "--app", help="ASGI 应用导入路径"),
    ready_timeout: float = typer.Option(60.0, "--ready-timeout", help="就绪等待时长(秒)"),
    pid_file: str | None = typer.Option(None, "--pid-file", help="PID 状态文件路径"),
    log_file: str | None = typer.Option(None, "--log-file", help="服务日志文件路径"),
) -> None:
    # Launch uvicorn as a detached background process, record its state in the
    # pid file and wait until it accepts connections.
    # (Comments rather than a docstring: Typer would show a docstring as the
    # command's help text.)
    pid_path = _resolve_path(pid_file, _default_pid_file())
    log_path = _resolve_path(log_file, _default_log_file())

    # Refuse to start twice; clean up state left behind by a dead process.
    state = _read_state(pid_path)
    if state:
        pid = int(state.get("pid", 0))
        if _is_process_running(pid):
            existing_host = state.get("host", host)
            existing_port = state.get("port", port)
            console.print(f"[yellow]⚠[/yellow] dataclaw 已在运行: pid={pid}, url=http://{existing_host}:{existing_port}")
            raise typer.Exit(1)
        _remove_state(pid_path)
        console.print("[yellow]⚠[/yellow] 检测到过期状态文件,已自动清理")

    # If another server already answers on the target address, the readiness
    # probe below would succeed immediately even though our child cannot bind
    # the port, so fail early instead of reporting a bogus success.
    if _port_accepts_connections(host, port):
        console.print(f"[red]✗[/red] 端口 {port} 已被占用 ({host}:{port}),无法启动 dataclaw")
        raise typer.Exit(1)

    _ensure_parent(log_path)
    command = _build_uvicorn_command(host, port, reload, log_level, app_target)
    # The child inherits its own copy of the descriptor; the `with` block
    # closes ours even when Popen raises.
    with log_path.open("a", encoding="utf-8") as log_handle:
        process = subprocess.Popen(
            command,
            stdout=log_handle,
            stderr=subprocess.STDOUT,
            start_new_session=True,
        )

    service_state = {
        "pid": process.pid,
        "host": host,
        "port": port,
        "app": app_target,
        "log_file": str(log_path),
        "started_at": int(time.time()),
    }
    _write_state(pid_path, service_state)

    ready = _wait_for_server_ready(host, port, ready_timeout)
    # Only report success while the child we spawned is still alive.
    if ready and process.poll() is None:
        console.print(f"[green]✓[/green] dataclaw 已启动: pid={process.pid}")
        console.print(f"[green]✓[/green] WebUI 地址: http://{host}:{port}")
        console.print(f"[green]✓[/green] 日志文件: {log_path}")
        return

    code = process.poll()
    if code is not None:
        _remove_state(pid_path)
        console.print(f"[red]✗[/red] dataclaw 启动失败,进程已退出 (code={code})")
        console.print(f"[yellow]日志文件[/yellow]: {log_path}")
        raise typer.Exit(1)

    console.print(f"[yellow]⚠[/yellow] 服务已拉起但未在 {ready_timeout:.1f}s 内确认就绪")
    console.print(f"[yellow]请检查日志[/yellow]: {log_path}")
|
||||
|
||||
|
||||
@app.command()
def status(
    pid_file: str | None = typer.Option(None, "--pid-file", help="PID 状态文件路径"),
) -> None:
    # Report whether the process recorded in the state file is alive; a state
    # file that points at a dead process is removed.
    state_path = _resolve_path(pid_file, _default_pid_file())
    recorded = _read_state(state_path)
    if not recorded:
        console.print("[yellow]●[/yellow] dataclaw 状态: stopped")
        return

    pid = int(recorded.get("pid", 0))
    if not _is_process_running(pid):
        _remove_state(state_path)
        console.print("[yellow]●[/yellow] dataclaw 状态: stopped (已清理过期状态文件)")
        return

    host = recorded.get("host", "127.0.0.1")
    port = recorded.get("port", 8000)
    console.print("[green]●[/green] dataclaw 状态: running")
    console.print(f"[green]pid[/green]: {pid}")
    console.print(f"[green]url[/green]: http://{host}:{port}")
|
||||
|
||||
|
||||
@app.command()
def stop(
    timeout: float = typer.Option(8.0, "--timeout", help="停止等待时长(秒)"),
    pid_file: str | None = typer.Option(None, "--pid-file", help="PID 状态文件路径"),
) -> None:
    # Stop the process recorded in the state file and remove the file once
    # the process is gone; exit with status 1 when it could not be stopped.
    state_path = _resolve_path(pid_file, _default_pid_file())
    recorded = _read_state(state_path)
    if not recorded:
        console.print("[yellow]⚠[/yellow] dataclaw 未运行")
        return

    pid = int(recorded.get("pid", 0))
    if not _is_process_running(pid):
        _remove_state(state_path)
        console.print("[yellow]⚠[/yellow] dataclaw 进程不存在,已清理状态文件")
        return

    if not _stop_pid(pid, timeout):
        console.print(f"[red]✗[/red] dataclaw 停止失败: pid={pid}")
        raise typer.Exit(1)

    _remove_state(state_path)
    console.print(f"[green]✓[/green] dataclaw 已停止: pid={pid}")
|
||||
@@ -0,0 +1,50 @@
|
||||
from clickhouse_driver import Client
|
||||
import os
|
||||
|
||||
class ClickHouseConnector:
    """Thin wrapper around a ``clickhouse_driver`` ``Client``.

    Every constructor argument that is omitted (or falsy) falls back to the
    matching ``CLICKHOUSE_*`` environment variable and then to a built-in
    default: host ``localhost``, port ``9000``, user ``default``, empty
    password, database ``default``.
    """

    def __init__(self, host: str = None, port: int = None, user: str = None, password: str = None, database: str = None):
        # The defaults are None (not the literal defaults) so that the
        # environment fallbacks in _resolve_settings are actually reachable.
        settings = self._resolve_settings(host, port, user, password, database)
        self.host = settings["host"]
        self.port = settings["port"]
        self.user = settings["user"]
        self.password = settings["password"]
        self.database = settings["database"]

        self.client = Client(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            database=self.database
        )

    @staticmethod
    def _resolve_settings(host, port, user, password, database) -> dict:
        """Merge explicit arguments with environment variables and defaults."""
        return {
            "host": host or os.getenv("CLICKHOUSE_HOST", "localhost"),
            "port": port or int(os.getenv("CLICKHOUSE_PORT", "9000")),
            "user": user or os.getenv("CLICKHOUSE_USER", "default"),
            "password": password or os.getenv("CLICKHOUSE_PASSWORD", ""),
            "database": database or os.getenv("CLICKHOUSE_DB", "default"),
        }

    def execute_query(self, query: str):
        """Run *query* and return the driver result including column types.

        Any driver error is printed and re-raised unchanged.
        """
        try:
            return self.client.execute(query, with_column_types=True)
        except Exception as e:
            print(f"ClickHouse Query Error: {e}")
            raise

    def get_schema(self):
        """Return ``{table: [{"name", "type"}, ...]}`` for the current database.

        Best effort: on any error the problem is printed and ``{}`` is returned.
        """
        query = "SELECT table, name, type FROM system.columns WHERE database = currentDatabase()"
        try:
            results = self.client.execute(query)
            schema = {}
            for row in results:
                schema.setdefault(row[0], []).append({"name": row[1], "type": row[2]})
            return schema
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}

    def test_connection(self) -> bool:
        """Return True when ``SELECT 1`` succeeds, False (after printing the error) otherwise."""
        try:
            self.client.execute("SELECT 1")
            return True
        except Exception as e:
            print(f"ClickHouse Connection Error: {e}")
            return False
|
||||
|
||||
# Module-level connector created at import time with no explicit arguments.
clickhouse_connector = ClickHouseConnector()
|
||||
@@ -0,0 +1,68 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any
|
||||
import os
|
||||
from app.core.files import resolve_upload_file_path
|
||||
|
||||
class CSVConnector:
    """Query a single CSV file through an in-memory DuckDB view.

    The file is exposed under a table name derived from its base name
    (see ``_get_table_name``); queries are expected to use that name.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"CSV file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-friendly table name from the file's base name."""
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        # Replace every non-alphanumeric character with an underscore.
        safe_name = "".join([c if c.isalnum() else "_" for c in base])
        # Identifiers must not start with a digit.
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _path_literal(self) -> str:
        """Return the file path as a SQL string literal with quotes escaped."""
        return "'" + str(self.file_path).replace("'", "''") + "'"

    def _create_view(self, conn, table_name: str) -> None:
        """Register the CSV file on *conn* as a view named *table_name*."""
        conn.execute(
            f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_csv_auto({self._path_literal()})"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* against the CSV view and return the rows as dicts.

        Errors are printed and re-raised; the connection is always closed.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._create_view(conn, table_name)
            # The query is expected to reference the table name from get_schema().
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        except Exception as e:
            print(f"CSV Query Error: {e}")
            raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{table_name: [{"name", "type"}, ...]}``; ``{}`` on any error."""
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._create_view(conn, table_name)
            columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
            # col[0] is the column name, col[1] its type.
            return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
        except Exception as e:
            print(f"Error getting schema: {e}")
            return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when the first row of the file can be read, False otherwise."""
        try:
            conn = duckdb.connect(":memory:")
            try:
                conn.execute(f"SELECT * FROM read_csv_auto({self._path_literal()}) LIMIT 1")
            finally:
                # Close the connection on failure as well as on success.
                conn.close()
            return True
        except Exception as e:
            print(f"CSV Connection Error: {e}")
            return False
|
||||
@@ -0,0 +1,48 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any, Optional
|
||||
import os
|
||||
|
||||
class DuckDBConnector:
    """Connector for a DuckDB database (a file path or ``:memory:``).

    A fresh connection is opened for every operation and closed afterwards.
    """

    def __init__(self, db_path: str = ":memory:"):
        self.db_path = db_path

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* as a double-quoted SQL identifier with quotes escaped."""
        return '"' + str(name).replace('"', '""') + '"'

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* and return the result rows as a list of dicts."""
        conn = duckdb.connect(self.db_path)
        try:
            df = conn.execute(query).df()
            return df.to_dict(orient="records")
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, Any]:
        """Describe every table reported by ``SHOW TABLES``.

        Returns ``{table: {"columns": [...], "primary_keys": [], "foreign_keys": []}}``;
        key information is not collected here, so both key lists are empty.
        """
        conn = duckdb.connect(self.db_path)
        try:
            schema = {}
            tables = conn.execute("SHOW TABLES").fetchall()
            for (table_name,) in tables:
                # Quote the name so tables with spaces, mixed punctuation or
                # reserved-word names can still be described.
                columns_info = conn.execute(
                    f"DESCRIBE {self._quote_identifier(table_name)}"
                ).fetchall()
                schema[table_name] = {
                    "columns": [{"name": col[0], "type": col[1]} for col in columns_info],
                    "primary_keys": [],
                    "foreign_keys": [],
                }
            return schema
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when the database opens and answers ``SELECT 1``."""
        try:
            conn = duckdb.connect(self.db_path)
            try:
                conn.execute("SELECT 1")
            finally:
                # Close the connection on failure as well as on success.
                conn.close()
            return True
        except Exception as e:
            print(f"DuckDB Connection Error: {e}")
            return False
|
||||
@@ -0,0 +1,82 @@
|
||||
from typing import Dict, Any, Optional
|
||||
import json
|
||||
import functools
|
||||
from app.connectors.postgres import PostgresConnector
|
||||
from app.connectors.clickhouse import ClickHouseConnector
|
||||
from app.connectors.parquet import ParquetConnector
|
||||
from app.connectors.csv import CSVConnector
|
||||
from app.connectors.duckdb import DuckDBConnector
|
||||
from app.models.datasource import DataSource
|
||||
from app.core.files import resolve_upload_file_path
|
||||
|
||||
@functools.lru_cache(maxsize=32)
|
||||
def _get_cached_connector(ds_type: str, config_json: str):
|
||||
config = json.loads(config_json)
|
||||
|
||||
if ds_type in ["postgres", "postgresql", "supabase"]:
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url:
|
||||
default_port = 6543 if ds_type == "supabase" else 5432
|
||||
port = config.get("port") or default_port
|
||||
db_url = f"postgresql://{config.get('user')}:{config.get('password')}@{config.get('host')}:{port}/{config.get('database')}"
|
||||
|
||||
if ds_type == "supabase" and "?" not in db_url:
|
||||
db_url += "?sslmode=require"
|
||||
elif ds_type == "supabase" and "sslmode=" not in db_url:
|
||||
db_url += "&sslmode=require"
|
||||
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "mysql":
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url:
|
||||
port = config.get("port") or 3306
|
||||
db_url = f"mysql+pymysql://{config.get('user')}:{config.get('password')}@{config.get('host')}:{port}/{config.get('database')}"
|
||||
elif not db_url.startswith("mysql+pymysql://"):
|
||||
db_url = db_url.replace("mysql://", "mysql+pymysql://")
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "sqlite":
|
||||
# SQLite uses connection string usually file path
|
||||
db_url = config.get("connection_string")
|
||||
if not db_url and config.get("file_path"):
|
||||
file_path = str(resolve_upload_file_path(config.get("file_path")))
|
||||
db_url = f"sqlite:///{file_path}"
|
||||
return PostgresConnector(db_url=db_url)
|
||||
|
||||
elif ds_type == "clickhouse":
|
||||
return ClickHouseConnector(
|
||||
host=config.get("host"),
|
||||
port=config.get("port", 9000),
|
||||
user=config.get("user", "default"),
|
||||
password=config.get("password", ""),
|
||||
database=config.get("database", "default")
|
||||
)
|
||||
|
||||
elif ds_type == "duckdb":
|
||||
db_path = config.get("database") or config.get("file_path") or ":memory:"
|
||||
if db_path != ":memory:":
|
||||
db_path = str(resolve_upload_file_path(db_path))
|
||||
return DuckDBConnector(db_path=db_path)
|
||||
|
||||
elif ds_type == "parquet":
|
||||
file_path = str(resolve_upload_file_path(config.get("file_path")))
|
||||
return ParquetConnector(file_path=file_path)
|
||||
|
||||
elif ds_type == "csv":
|
||||
file_path = str(resolve_upload_file_path(config.get("file_path")))
|
||||
return CSVConnector(file_path=file_path)
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported data source type: {ds_type}")
|
||||
|
||||
def get_connector(datasource: DataSource):
    """Return the (cached) connector for a stored data source.

    The config is serialised with sorted keys so equal configurations map
    to the same cache entry.
    """
    cache_key = json.dumps(datasource.config, sort_keys=True)
    normalized_type = datasource.type.lower()
    return _get_cached_connector(normalized_type, cache_key)
|
||||
|
||||
def get_connector_from_config(ds_type: str, config: Dict[str, Any]):
    """Return the (cached) connector for a type/config pair not stored in the DB.

    Used to test a connection before the data source is saved.
    """
    cache_key = json.dumps(config, sort_keys=True)
    normalized_type = ds_type.lower()
    return _get_cached_connector(normalized_type, cache_key)
|
||||
@@ -0,0 +1,58 @@
|
||||
import duckdb
|
||||
import pandas as pd
|
||||
from typing import List, Dict, Any
|
||||
import os
|
||||
|
||||
class ParquetConnector:
    """Query a single Parquet file through an in-memory DuckDB view.

    The file is exposed under a table name derived from its base name
    (see ``_get_table_name``); queries are expected to use that name.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path
        if not os.path.exists(self.file_path):
            raise FileNotFoundError(f"Parquet file not found: {self.file_path}")

    def _get_table_name(self) -> str:
        """Derive a SQL-friendly table name from the file's base name.

        Names that are already plain identifiers are returned unchanged;
        other characters become underscores and a leading digit gets a
        ``t_`` prefix, matching the CSV connector's rule.
        """
        base = os.path.splitext(os.path.basename(self.file_path))[0]
        safe_name = "".join([c if c.isalnum() else "_" for c in base])
        if safe_name and safe_name[0].isdigit():
            safe_name = f"t_{safe_name}"
        return safe_name

    def _path_literal(self) -> str:
        """Return the file path as a SQL string literal with quotes escaped."""
        return "'" + str(self.file_path).replace("'", "''") + "'"

    def _create_view(self, conn, table_name: str) -> None:
        """Register the Parquet file on *conn* as a view named *table_name*."""
        conn.execute(
            f"CREATE OR REPLACE VIEW {table_name} AS SELECT * FROM read_parquet({self._path_literal()})"
        )

    def execute_query(self, query: str) -> List[Dict[str, Any]]:
        """Run *query* against the Parquet view and return the rows as dicts.

        Query errors are printed and re-raised; the connection is always
        closed, including when registering the view fails.
        """
        conn = duckdb.connect(":memory:")
        try:
            self._create_view(conn, self._get_table_name())
            try:
                # DuckDB returns a dataframe; convert it to a list of dicts.
                df = conn.execute(query).df()
                return df.to_dict(orient="records")
            except Exception as e:
                print(f"Parquet Query Error: {e}")
                raise
        finally:
            conn.close()

    def get_schema(self) -> Dict[str, List[Dict[str, str]]]:
        """Return ``{table_name: [{"name", "type"}, ...]}``.

        A failure while describing the view is printed and yields ``{}``; a
        failure while registering the view propagates to the caller. The
        connection is closed in both cases.
        """
        conn = duckdb.connect(":memory:")
        table_name = self._get_table_name()
        try:
            self._create_view(conn, table_name)
            try:
                columns = conn.execute(f"DESCRIBE {table_name}").fetchall()
                return {table_name: [{"name": col[0], "type": col[1]} for col in columns]}
            except Exception as e:
                print(f"Error getting schema: {e}")
                return {}
        finally:
            conn.close()

    def test_connection(self) -> bool:
        """Return True when the first row of the file can be read, False otherwise."""
        try:
            conn = duckdb.connect(":memory:")
            try:
                conn.execute(f"SELECT * FROM read_parquet({self._path_literal()}) LIMIT 1")
            finally:
                # Close the connection on failure as well as on success.
                conn.close()
            return True
        except Exception as e:
            print(f"Parquet Connection Error: {e}")
            return False
|
||||
@@ -0,0 +1,113 @@
|
||||
from sqlalchemy import create_engine, text
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
from typing import Generator
|
||||
import os
|
||||
|
||||
class PostgresConnector:
    """SQLAlchemy-backed connector addressed by a database URL.

    When no URL is given, ``POSTGRES_URL`` from the environment is used and,
    failing that, a placeholder local PostgreSQL URL.
    """

    def __init__(self, db_url: str = None):
        fallback_url = os.getenv("POSTGRES_URL", "postgresql://user:password@localhost:5432/dbname")
        self.db_url = db_url or fallback_url
        self.engine = create_engine(self.db_url)
        self.SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=self.engine)

    def get_db(self) -> Generator:
        """Yield a new session and close it once the consumer is finished."""
        session = self.SessionLocal()
        try:
            yield session
        finally:
            session.close()

    def execute_query(self, query: str):
        """Execute raw SQL text and return every row as a plain dict."""
        with self.engine.connect() as connection:
            rows = []
            for row in connection.execute(text(query)):
                rows.append(dict(row._mapping))
            return rows

    def get_schema(self):
        """Describe all tables: columns, primary keys and foreign keys.

        Returns ``{table: {"columns", "primary_keys", "foreign_keys"}}``.
        Uses the inspector's bulk ``get_multi_*`` calls when available and
        falls back to per-table reflection otherwise. Errors are printed
        (with traceback) and re-raised.
        """
        try:
            from sqlalchemy import inspect

            def describe_columns(raw_columns):
                return [{"name": col['name'], "type": str(col['type'])} for col in raw_columns]

            def primary_key_columns(pk_constraint):
                return pk_constraint.get('constrained_columns', []) if pk_constraint else []

            def describe_foreign_keys(raw_fks):
                return [
                    {
                        "constrained_columns": fk['constrained_columns'],
                        "referred_table": fk['referred_table'],
                        "referred_columns": fk['referred_columns']
                    }
                    for fk in raw_fks
                ]

            inspector = inspect(self.engine)
            # PostgreSQL reflects the 'public' schema; other dialects use their default.
            is_postgres = self.engine.dialect.name == 'postgresql'
            schema_name = 'public' if is_postgres else None

            table_names = inspector.get_table_names(schema=schema_name)

            # Bulk reflection avoids one round trip per table and per aspect.
            if hasattr(inspector, 'get_multi_columns'):
                all_columns = inspector.get_multi_columns(schema=schema_name)
                all_pks = inspector.get_multi_pk_constraint(schema=schema_name)
                all_fks = inspector.get_multi_foreign_keys(schema=schema_name)
                return {
                    name: {
                        "columns": describe_columns(all_columns.get((schema_name, name), [])),
                        "primary_keys": primary_key_columns(all_pks.get((schema_name, name))),
                        "foreign_keys": describe_foreign_keys(all_fks.get((schema_name, name), []))
                    }
                    for name in table_names
                }

            # Per-table fallback for inspectors without the bulk API.
            described = {}
            for name in table_names:
                columns = describe_columns(inspector.get_columns(name, schema=schema_name))
                pks = primary_key_columns(inspector.get_pk_constraint(name, schema=schema_name))
                foreign_keys = describe_foreign_keys(inspector.get_foreign_keys(name, schema=schema_name))
                described[name] = {
                    "columns": columns,
                    "primary_keys": pks,
                    "foreign_keys": foreign_keys
                }
            return described
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(f"Error getting schema: {e}")
            raise e

    def test_connection(self) -> bool:
        """Return True when ``SELECT 1`` succeeds; print and re-raise any failure."""
        try:
            with self.engine.connect() as connection:
                connection.execute(text("SELECT 1"))
        except Exception as e:
            print(f"PostgreSQL Connection Error: {e}")
            raise e
        return True
|
||||
|
||||
# Module-level connector created at import time with no explicit URL.
postgres_connector = PostgresConnector()
|
||||
@@ -0,0 +1,23 @@
|
||||
from contextvars import ContextVar
|
||||
from typing import Any, Callable, Awaitable, Dict, Optional
|
||||
|
||||
# The current session ID processing the request
|
||||
current_session_id: ContextVar[str] = ContextVar("current_session_id", default="")
|
||||
|
||||
# A callback to send progress updates to the frontend during tool execution
|
||||
current_progress_callback: ContextVar[Optional[Callable[[str], Awaitable[None]]]] = ContextVar("current_progress_callback", default=None)
|
||||
|
||||
# A payload dictionary to store visualization results generated by tools
|
||||
# This will be picked up by the stream handler and sent to the frontend
|
||||
current_viz_data: ContextVar[Optional[Dict[str, Any]]] = ContextVar("current_viz_data", default=None)
|
||||
|
||||
# Store the last queried data so the Visualization Tool can access it
|
||||
current_data: ContextVar[Optional[list]] = ContextVar("current_data", default=None)
|
||||
|
||||
# The data source requested by the user or bound to the session
|
||||
current_data_source: ContextVar[str] = ContextVar("current_data_source", default="postgres")
|
||||
|
||||
# Any file URL attached to the request
|
||||
current_file_url: ContextVar[Optional[str]] = ContextVar("current_file_url", default=None)
|
||||
|
||||
# Knowledge-base id for the current context, None when unset.
# NOTE(review): presumably set per request like the variables above — confirm against the caller.
current_knowledge_base_id: ContextVar[Optional[str]] = ContextVar("current_knowledge_base_id", default=None)
|
||||
@@ -0,0 +1,203 @@
|
||||
import mimetypes
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
from urllib.parse import quote
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.core.data_root import get_data_root, get_reports_root, get_uploads_root, get_workspace_root
|
||||
|
||||
# Matches `local://` URIs up to the next whitespace, angle bracket, quote or
# closing bracket/brace/parenthesis.
LOCAL_URI_PATTERN = re.compile(r"local://[^\s<>'\"\]\)\}]+")
|
||||
# Matches file-like paths ending in a 1-12 character alphanumeric extension:
# Windows drive paths, POSIX absolute paths, and relative paths that contain
# at least one directory component.
PATH_PATTERN = re.compile(
|
||||
r"(?:[A-Za-z]:[\\/][^\s<>'\"]+\.[A-Za-z0-9]{1,12}|/[^\s<>'\"]+\.[A-Za-z0-9]{1,12}|(?:\.\./|\.?/)?(?:[\w\-.]+[\\/])+[\w\-.]+\.[A-Za-z0-9]{1,12})"
|
||||
)
|
||||
# Matches `data/data/<name>.<ext>` with either slash style, case-insensitively.
REPORT_PATH_PATTERN = re.compile(r"data[\\/]data[\\/][\w\-.]+\.[A-Za-z0-9]{1,12}", re.IGNORECASE)
|
||||
# File suffixes that _is_previewable accepts regardless of the guessed MIME type.
PREVIEWABLE_EXTENSIONS = {
|
||||
".html",
|
||||
".htm",
|
||||
".pdf",
|
||||
".pptx",
|
||||
".txt",
|
||||
".md",
|
||||
".json",
|
||||
".csv",
|
||||
".tsv",
|
||||
".yaml",
|
||||
".yml",
|
||||
".xml",
|
||||
".log",
|
||||
}
|
||||
|
||||
|
||||
class ArtifactPayload(BaseModel):
    # Description of one file artifact as built by _build_artifact_payload.

    name: str  # file name without directories
    mime_type: str  # guessed MIME type
    size: int  # size in bytes as reported by stat()
    download_url: str  # endpoint URL that serves the file
    previewable: bool  # whether a preview URL is offered
    preview_url: str | None = None  # set only when previewable
|
||||
|
||||
|
||||
def extract_artifacts(content: str, session_messages: list[dict[str, Any]] | None = None) -> list[dict[str, Any]]:
    """Find file references in the given text and describe the ones that exist.

    Locators are collected from *content* and the relevant session messages
    in first-seen order, resolved to paths, and every existing regular file
    is reported once (deduplicated by resolved path) as a dumped
    ``ArtifactPayload`` with ``None`` fields omitted.
    """
    candidate_texts = _collect_candidate_texts(content, session_messages or [])

    # dict keys keep insertion order, giving an ordered, duplicate-free list.
    unique_locators: dict[str, None] = {}
    for text in candidate_texts:
        for locator in _extract_locators(text):
            unique_locators.setdefault(locator, None)

    artifacts: list[dict[str, Any]] = []
    reported: set[Path] = set()
    for locator in unique_locators:
        path = _resolve_locator(locator)
        if path and path.exists() and path.is_file():
            resolved = path.resolve()
            if resolved not in reported:
                reported.add(resolved)
                payload = _build_artifact_payload(locator, resolved)
                artifacts.append(payload.model_dump(exclude_none=True))
    return artifacts
|
||||
|
||||
|
||||
def _build_artifact_payload(locator: str, path: Path) -> ArtifactPayload:
    """Describe the file at *path*; URLs carry the fully percent-encoded *locator*."""
    mime_type = _guess_mime_type(path)
    previewable = _is_previewable(path, mime_type)
    target = quote(locator, safe="")
    if previewable:
        preview_url = f"/nanobot/artifacts/preview?target={target}"
    else:
        preview_url = None
    download_url = f"/nanobot/artifacts/download?target={target}"
    return ArtifactPayload(
        name=path.name,
        mime_type=mime_type,
        size=path.stat().st_size,
        download_url=download_url,
        previewable=previewable,
        preview_url=preview_url,
    )
|
||||
|
||||
|
||||
def _guess_mime_type(path: Path) -> str:
|
||||
mime_type, _ = mimetypes.guess_type(path.name)
|
||||
return mime_type or "application/octet-stream"
|
||||
|
||||
|
||||
def _is_previewable(path: Path, mime_type: str) -> bool:
|
||||
if mime_type.startswith("image/") or mime_type.startswith("text/"):
|
||||
return True
|
||||
extension = path.suffix.lower()
|
||||
if extension in PREVIEWABLE_EXTENSIONS:
|
||||
return True
|
||||
return mime_type in {
|
||||
"application/pdf",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
}
|
||||
|
||||
|
||||
def _collect_candidate_texts(content: str, session_messages: list[dict[str, Any]]) -> list[str]:
|
||||
texts = [content or ""]
|
||||
if not session_messages:
|
||||
return texts
|
||||
last_user_idx = -1
|
||||
for idx, message in enumerate(session_messages):
|
||||
if message.get("role") == "user":
|
||||
last_user_idx = idx
|
||||
if last_user_idx == -1:
|
||||
segment = session_messages
|
||||
else:
|
||||
segment = session_messages[last_user_idx + 1 :]
|
||||
for message in segment:
|
||||
raw = message.get("content")
|
||||
flattened = _flatten_content(raw)
|
||||
if flattened:
|
||||
texts.append(flattened)
|
||||
return texts
|
||||
|
||||
|
||||
def _extract_locators(text: str) -> Iterable[str]:
|
||||
if not text:
|
||||
return []
|
||||
ordered: list[str] = []
|
||||
seen: set[str] = set()
|
||||
patterns = (LOCAL_URI_PATTERN, REPORT_PATH_PATTERN, PATH_PATTERN)
|
||||
for pattern in patterns:
|
||||
for match in pattern.findall(text):
|
||||
normalized = _normalize_locator(match)
|
||||
if not normalized or normalized in seen:
|
||||
continue
|
||||
seen.add(normalized)
|
||||
ordered.append(normalized)
|
||||
return ordered
|
||||
|
||||
|
||||
def _normalize_locator(raw_locator: str) -> str:
|
||||
locator = raw_locator.strip().strip("`'\"")
|
||||
locator = locator.rstrip(".,;:!?)]}")
|
||||
return locator
|
||||
|
||||
|
||||
def _resolve_locator(locator: str) -> Path | None:
    """Map a locator string to a filesystem path, or ``None`` if nothing matches.

    ``local://`` locators are looked up under the workspace, reports and
    uploads roots (falling back to ``uploads/<file name>`` even when it does
    not exist). Other locators are returned as-is when absolute, mapped under
    the repository root when they start with ``data/data/``, and otherwise
    looked up under the workspace, data and repository roots.

    NOTE(review): absolute paths are returned untouched and relative paths
    may contain ``..``, so nothing here confines the result to the data
    roots — confirm that the download/preview endpoints enforce that.
    """
    data_root = get_data_root()
    workspace_root = get_workspace_root()
    uploads_root = get_uploads_root()
    reports_root = get_reports_root()
    repo_root = data_root.parent

    if locator.startswith("local://"):
        relative_text = locator.replace("local://", "", 1).strip().lstrip("/\\")
        if not relative_text:
            return None
        relative = Path(relative_text)
        if relative.is_absolute():
            return relative
        upload_by_name = uploads_root / relative.name
        local_candidates = [
            workspace_root / relative,
            reports_root / relative,
            uploads_root / relative,
            uploads_root / relative.name,
        ]
        return next((p for p in local_candidates if p.exists()), upload_by_name)

    normalized = locator.replace("\\", "/")
    literal_path = Path(locator)
    if literal_path.is_absolute():
        return literal_path
    if normalized.startswith("data/data/"):
        return repo_root / normalized

    root_candidates = [
        workspace_root / normalized,
        data_root / normalized,
        repo_root / normalized,
    ]
    return next((p for p in root_candidates if p.exists()), None)
|
||||
|
||||
|
||||
def _flatten_content(value: Any) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
if isinstance(value, str):
|
||||
return value
|
||||
if isinstance(value, list):
|
||||
fragments: list[str] = []
|
||||
for item in value:
|
||||
flattened = _flatten_content(item)
|
||||
if flattened:
|
||||
fragments.append(flattened)
|
||||
return "\n".join(fragments)
|
||||
if isinstance(value, dict):
|
||||
fragments: list[str] = []
|
||||
text = value.get("text")
|
||||
if isinstance(text, str):
|
||||
fragments.append(text)
|
||||
content = value.get("content")
|
||||
if content is not None:
|
||||
nested = _flatten_content(content)
|
||||
if nested:
|
||||
fragments.append(nested)
|
||||
for field in ("path", "file", "file_path", "url"):
|
||||
data = value.get(field)
|
||||
if isinstance(data, str):
|
||||
fragments.append(data)
|
||||
return "\n".join(fragments)
|
||||
return str(value)
|
||||
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Directory two levels above the directory containing this module.
BACKEND_ROOT = Path(__file__).resolve().parents[2]
|
||||
# Parent directory of BACKEND_ROOT.
REPO_ROOT = BACKEND_ROOT.parent
|
||||
# Preferred data directory, used by get_data_root when DATA_ROOT is not set.
DEFAULT_DATA_ROOT = REPO_ROOT / "data"
|
||||
# Older data location; get_data_root uses it only when the default directory is absent.
LEGACY_DATA_ROOT = BACKEND_ROOT / "data"
|
||||
|
||||
|
||||
def get_data_root() -> Path:
    """Return the directory that holds all application data.

    Precedence: a non-blank ``DATA_ROOT`` environment variable (expanded and
    resolved), then the default location; the legacy location is used, with
    a printed migration hint, only when it exists and the default does not.
    """
    override = (os.getenv("DATA_ROOT") or "").strip()
    if override:
        return Path(override).expanduser().resolve()
    if not DEFAULT_DATA_ROOT.exists() and LEGACY_DATA_ROOT.exists():
        print(f"[DATA_ROOT] legacy path detected: {LEGACY_DATA_ROOT}. Please migrate to {DEFAULT_DATA_ROOT}.")
        return LEGACY_DATA_ROOT
    return DEFAULT_DATA_ROOT
|
||||
|
||||
|
||||
def get_workspace_root() -> Path:
    """Return the ``workspace`` directory under the active data root."""
    return get_data_root() / "workspace"
|
||||
|
||||
|
||||
def get_uploads_root() -> Path:
    """Return the ``uploads`` directory under the active data root."""
    return get_data_root() / "uploads"
|
||||
|
||||
|
||||
def get_reports_root() -> Path:
    """Return the reports directory, which is the ``data`` sub-directory of the active data root."""
    return get_data_root() / "data"
|
||||
|
||||
|
||||
def ensure_data_layout() -> None:
    """Create the data root and its workspace, uploads and reports directories.

    Each directory is created with ``parents=True, exist_ok=True``, so the
    call is safe when some or all of them already exist.
    """
    for locate in (get_data_root, get_workspace_root, get_uploads_root, get_reports_root):
        locate().mkdir(parents=True, exist_ok=True)
|
||||
@@ -0,0 +1,43 @@
|
||||
import smtplib
|
||||
import os
|
||||
from email.mime.text import MIMEText
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
|
||||
def send_verification_email(to_email: str, token: str):
    """Send the account-verification email containing a tokenized link.

    Configuration is read from the environment: ``SMTP_HOST``, ``SMTP_PORT``,
    ``SMTP_USER``, ``SMTP_PASSWORD`` and ``FRONTEND_URL``. When the user or
    password is missing the function prints a notice and returns without
    sending. Delivery is best-effort: any failure while connecting or sending
    is printed, not raised.

    Args:
        to_email: Recipient address.
        token: Verification token appended to the link as ``?token=...``.
    """
    smtp_host = os.getenv("SMTP_HOST", "smtp.qq.com")
    smtp_port = int(os.getenv("SMTP_PORT", "465"))
    smtp_user = os.getenv("SMTP_USER", "")
    smtp_password = os.getenv("SMTP_PASSWORD", "")
    frontend_url = os.getenv("FRONTEND_URL", "http://localhost:5173")

    if not smtp_user or not smtp_password:
        print("SMTP configuration is missing. Skip sending email.")
        return

    msg = MIMEMultipart()
    msg['From'] = smtp_user
    msg['To'] = to_email
    msg['Subject'] = "请验证你的邮箱地址"

    verify_link = f"{frontend_url}/verify-email?token={token}"
    body = f"""
<html>
<body>
<h2>欢迎使用全源灵动!</h2>
<p>请点击下方链接验证邮箱并激活账号:</p>
<p><a href="{verify_link}">{verify_link}</a></p>
<p>如果你没有发起该请求,请忽略此邮件。</p>
</body>
</html>
"""
    msg.attach(MIMEText(body, 'html'))

    try:
        # SMTP_SSL opens an implicit-TLS connection (the default port is 465).
        # The context manager closes the connection even when login or send
        # fails, and the timeout keeps an unreachable server from blocking
        # the caller indefinitely.
        with smtplib.SMTP_SSL(smtp_host, smtp_port, timeout=30) as server:
            server.login(smtp_user, smtp_password)
            server.send_message(msg)
        print(f"Verification email sent to {to_email}")
    except Exception as e:
        # Deliberately best-effort: report the failure instead of raising.
        print(f"Failed to send email: {e}")
|
||||
@@ -0,0 +1,96 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from app.core.data_root import (
|
||||
BACKEND_ROOT,
|
||||
LEGACY_DATA_ROOT,
|
||||
get_data_root,
|
||||
get_reports_root,
|
||||
get_uploads_root,
|
||||
get_workspace_root,
|
||||
)
|
||||
|
||||
|
||||
# Directory roots are computed once, when this module is imported.
data_root = get_data_root()
workspace_root = get_workspace_root()
uploads_root = get_uploads_root()
reports_root = get_reports_root()
# The same sub-directory names under the legacy data root.
legacy_workspace_root = LEGACY_DATA_ROOT / "workspace"
legacy_uploads_root = LEGACY_DATA_ROOT / "uploads"
legacy_reports_root = LEGACY_DATA_ROOT / "data"
backend_root = BACKEND_ROOT
# Roots under which ensure_artifact_access() accepts a resolved path.
allowed_artifact_roots = (
    workspace_root,
    uploads_root,
    reports_root,
    legacy_workspace_root,
    legacy_uploads_root,
    legacy_reports_root,
)
|
||||
|
||||
|
||||
def resolve_upload_file_path(file_url: Optional[str]) -> Path:
    """Translate a stored file URL into a filesystem path.

    ``local://<name>`` URLs map to ``uploads_root / basename(<name>)``; any
    directory components in ``<name>`` are discarded. Every other value is
    returned as ``Path(file_url)`` unchanged.

    Raises:
        ValueError: If ``file_url`` is empty, or a ``local://`` URL does not
            name a file (its basename is empty, ``.`` or ``..``).
    """
    if not file_url:
        raise ValueError("File URL is empty")

    if file_url.startswith("local://"):
        raw_name = file_url.replace("local://", "", 1)
        safe_name = os.path.basename(raw_name)
        # basename() keeps "." and ".." and yields "" for a trailing slash;
        # joining those would return the uploads directory or its parent.
        if safe_name in ("", ".", ".."):
            raise ValueError("File URL does not name a file")
        return uploads_root / safe_name

    return Path(file_url)
|
||||
|
||||
|
||||
def resolve_artifact_target(target: str) -> Path | None:
|
||||
locator = (target or "").strip().strip("'\"")
|
||||
if not locator:
|
||||
return None
|
||||
if locator.startswith("local://"):
|
||||
raw_local = locator.replace("local://", "", 1).strip().lstrip("/\\")
|
||||
if not raw_local:
|
||||
return None
|
||||
candidate = Path(raw_local)
|
||||
if candidate.is_absolute():
|
||||
return candidate
|
||||
checks = (
|
||||
workspace_root / candidate,
|
||||
reports_root / candidate,
|
||||
uploads_root / candidate,
|
||||
uploads_root / candidate.name,
|
||||
)
|
||||
for path in checks:
|
||||
if path.exists():
|
||||
return path
|
||||
return uploads_root / candidate.name
|
||||
normalized = locator.replace("\\", "/")
|
||||
path = Path(locator)
|
||||
if path.is_absolute():
|
||||
return path
|
||||
if normalized.startswith("data/data/"):
|
||||
return data_root.parent / normalized
|
||||
checks = (
|
||||
workspace_root / normalized,
|
||||
data_root / normalized,
|
||||
backend_root / normalized,
|
||||
)
|
||||
for candidate in checks:
|
||||
if candidate.exists():
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
def ensure_artifact_access(path: Path, *, require_file: bool = True) -> Path:
    """Resolve ``path`` and confirm it lies under an allowed artifact root.

    Args:
        path: Path to check; it must exist.
        require_file: When true the target must be a regular file, otherwise
            it must be a directory.

    Returns:
        The fully resolved path.

    Raises:
        FileNotFoundError: The path does not exist or is of the wrong kind.
        PermissionError: The resolved path is outside every allowed root.
    """
    try:
        resolved = path.resolve(strict=True)
    except FileNotFoundError as exc:
        raise FileNotFoundError("目标文件不存在") from exc

    if require_file:
        if not resolved.is_file():
            raise FileNotFoundError("目标文件不存在")
    elif not resolved.is_dir():
        raise FileNotFoundError("目标目录不存在")

    # Containment is checked on resolved paths on both sides.
    if any(resolved.is_relative_to(root.resolve()) for root in allowed_artifact_roots):
        return resolved
    raise PermissionError("非法路径访问")
|
||||
@@ -0,0 +1,87 @@
|
||||
import os
|
||||
from typing import Optional, Dict
|
||||
|
||||
from nanobot.providers.azure_openai_provider import AzureOpenAIProvider
|
||||
from nanobot.providers.openai_codex_provider import OpenAICodexProvider
|
||||
from nanobot.providers.registry import find_by_name
|
||||
|
||||
from app.core.patched_openai_compat_provider import PatchedOpenAICompatProvider
|
||||
|
||||
|
||||
def normalize_provider_name(provider: Optional[str]) -> Optional[str]:
    """Return the canonical provider name for a user-supplied value.

    The value is stripped and lower-cased, then the aliases ``azure`` and
    ``local`` are mapped to ``azure_openai`` and ``vllm``. Any other name is
    returned in its normalized form.

    Returns:
        The canonical name, or ``None`` when ``provider`` is missing, empty
        or contains only whitespace.
    """
    if not provider:
        return None
    normalized = provider.strip().lower()
    if not normalized:
        # Whitespace-only input carries no provider name.
        return None
    alias_map = {
        "azure": "azure_openai",
        "local": "vllm",
    }
    return alias_map.get(normalized, normalized)
|
||||
|
||||
|
||||
def _running_in_docker() -> bool:
    """Best-effort check for whether the process runs inside a container.

    True when ``DATACLAW_RUNNING_IN_DOCKER`` is set to ``1``/``true``/``yes``/``y``
    (case-insensitive, surrounding whitespace ignored), otherwise true only
    if ``/.dockerenv`` exists.
    """
    flag = os.environ.get("DATACLAW_RUNNING_IN_DOCKER", "")
    if flag.strip().lower() in {"1", "true", "yes", "y"}:
        return True
    return os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
def _rewrite_localhost_api_base(api_base: Optional[str]) -> Optional[str]:
    """
    When running inside Docker, `localhost` points to the container itself.
    For host-local LLMs (Ollama/vLLM), users often configure `http://localhost:...`,
    which breaks in containers. We rewrite it to `host.docker.internal`.

    Only an ``http``/``https`` URL whose host is exactly ``localhost`` or
    ``127.0.0.1`` is rewritten; the port, path and query are kept. Hosts that
    merely start with those strings (``localhost.example.com``,
    ``127.0.0.10``) are left untouched.

    Returns:
        The rewritten URL (stripped of surrounding whitespace), or the
        original ``api_base`` when no rewrite applies.
    """
    import re  # local import: this module does not import re at the top

    if not api_base:
        return api_base
    base = api_base.strip()
    # The lookahead requires the host to end right after the loopback name:
    # at a port, path, query, fragment, or the end of the string.
    match = re.match(
        r"(https?://)(?:localhost|127\.0\.0\.1)(?=[:/?#]|$)",
        base,
        re.IGNORECASE,
    )
    if match is None:
        return api_base
    return f"{match.group(1)}host.docker.internal{base[match.end():]}"
|
||||
|
||||
|
||||
def build_llm_provider(
    *,
    model: str,
    provider: Optional[str] = None,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
    extra_headers: Optional[Dict[str, str]] = None,
):
    """Construct the provider object that matches ``provider`` and ``model``.

    The provider name is normalized and looked up in the registry; the
    registry entry's ``backend`` selects the implementation, defaulting to
    the OpenAI-compatible provider when there is no entry. Inside Docker a
    loopback ``api_base`` is rewritten first.

    Raises:
        ValueError: For the Azure OpenAI backend when ``api_key`` or
            ``api_base`` is missing.
    """
    canonical_name = normalize_provider_name(provider)
    spec = find_by_name(canonical_name) if canonical_name else None
    backend = "openai_compat" if not spec else spec.backend

    if _running_in_docker():
        api_base = _rewrite_localhost_api_base(api_base)

    # Codex is selected by backend or by the model-name prefix.
    uses_codex = backend == "openai_codex" or model.startswith("openai-codex/")
    if uses_codex:
        return OpenAICodexProvider(default_model=model)

    if backend == "azure_openai":
        if not (api_key and api_base):
            raise ValueError("Azure OpenAI requires api_key and api_base.")
        return AzureOpenAIProvider(api_key=api_key, api_base=api_base, default_model=model)

    shared_kwargs = {
        "api_key": api_key,
        "api_base": api_base,
        "default_model": model,
        "extra_headers": extra_headers,
    }
    if backend == "anthropic":
        # Imported only when this backend is actually requested.
        from nanobot.providers.anthropic_provider import AnthropicProvider

        return AnthropicProvider(**shared_kwargs)

    return PatchedOpenAICompatProvider(spec=spec, **shared_kwargs)
|
||||
@@ -0,0 +1,499 @@
|
||||
import asyncio
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Callable, Awaitable, Any, Dict
|
||||
|
||||
# Make the nanobot package importable: it is loaded from the "agent-core"
# directory under PROJECT_ROOT, so that directory is appended to sys.path.
# PROJECT_ROOT is parents[3] of this file's resolved path; per the original
# note this file is backend/app/core/nanobot.py, which makes parents[3] the
# directory that contains backend/.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
if str(PROJECT_ROOT / "agent-core") not in sys.path:
    sys.path.append(str(PROJECT_ROOT / "agent-core"))
|
||||
|
||||
from nanobot.agent.loop import AgentLoop
|
||||
from nanobot.bus.events import OutboundMessage
|
||||
from nanobot.bus.queue import MessageBus
|
||||
from nanobot.config.loader import load_config
|
||||
from nanobot.cron.service import CronService
|
||||
from nanobot.providers.openai_codex_provider import OpenAICodexProvider
|
||||
from nanobot.providers.azure_openai_provider import AzureOpenAIProvider
|
||||
from nanobot.providers.base import GenerationSettings
|
||||
from nanobot.providers.registry import find_by_name
|
||||
from nanobot.session.manager import SessionManager
|
||||
from nanobot.config.schema import Config
|
||||
|
||||
# Skills loader. Imported at module level, which is safe as long as
# app.api.skills does not import this module (that would be a circular import).
|
||||
from app.api.skills import load_skills
|
||||
from app.core.patched_openai_compat_provider import PatchedOpenAICompatProvider
|
||||
from app.core.llm_provider import _rewrite_localhost_api_base, _running_in_docker
|
||||
from app.services.llm_cache import get_llm_configs, get_active_llm_config
|
||||
from app.services.web_search_config_store import get_web_search_config
|
||||
|
||||
from app.core.data_root import get_workspace_root
|
||||
from app.trace import build_error_attributes, build_usage_attributes, trace_service
|
||||
|
||||
class NanobotIntegration:
    """Wrapper that owns the nanobot agent, message bus and cron service.

    A default agent is built by ``initialize()``. Requests that name a
    different LLM configuration, or that carry a project id, are served by
    additional agents cached per ``(model id, project id)``.
    """

    def __init__(self):
        self.agent: AgentLoop | None = None
        self.bus: MessageBus | None = None
        self.cron: CronService | None = None
        self.config: Config | None = None
        self._started = False
        # Extra agents keyed by (normalized model id, project id).
        self._model_agent_cache: Dict[tuple[str | None, int | None], AgentLoop] = {}
        self._model_agent_lock = asyncio.Lock()
        # Token usage of the most recent request, per session id.
        self._last_usage_by_session: Dict[str, Dict[str, Any]] = {}
        # Strong references to tasks spawned by start(); the event loop only
        # holds weak references, so unreferenced tasks may be collected.
        self._background_tasks: set[asyncio.Task] = set()

    @staticmethod
    def _normalize_config_value(value: Any) -> Any:
        """Strip strings and map blank strings to ``None``; pass other values through."""
        if isinstance(value, str):
            stripped = value.strip()
            return stripped or None
        return value

    @staticmethod
    def _normalize_model_id(value: Any) -> str | None:
        """Return a model id as a stripped string, or ``None`` when missing or blank."""
        if value is None:
            return None
        if isinstance(value, str):
            stripped = value.strip()
            return stripped or None
        return str(value)

    @staticmethod
    def _extract_response_text(response: Any) -> str:
        """Pull the text content out of whatever the agent returned.

        Handles ``None``, plain strings, ``OutboundMessage`` objects, dicts
        with a ``"content"`` key and objects with a string ``content``
        attribute; anything else falls back to ``str(response)``.
        """
        if response is None:
            return ""
        if isinstance(response, str):
            return response
        if isinstance(response, OutboundMessage):
            return response.content or ""
        if isinstance(response, dict):
            content = response.get("content")
            if isinstance(content, str):
                return content
            return str(content or "")
        content = getattr(response, "content", None)
        if isinstance(content, str):
            return content
        return str(response)

    @staticmethod
    def _normalize_usage(usage: Any) -> Dict[str, int] | None:
        """Coerce a usage dict to integer token counts.

        Returns ``None`` when ``usage`` is not a dict or when both prompt and
        completion counts are zero. A missing or zero ``total_tokens`` is
        replaced by ``prompt + completion``.
        """
        if not isinstance(usage, dict):
            return None
        normalized: Dict[str, int] = {}
        prompt = int(usage.get("prompt_tokens", 0) or 0)
        completion = int(usage.get("completion_tokens", 0) or 0)
        total = int(usage.get("total_tokens", 0) or 0)

        # If total_tokens is missing or zero, calculate it
        if total == 0:
            total = prompt + completion

        normalized["prompt_tokens"] = prompt
        normalized["completion_tokens"] = completion
        normalized["total_tokens"] = total
        return normalized if (prompt > 0 or completion > 0) else None

    def get_last_usage(self, session_id: str) -> Dict[str, int] | None:
        """Return a copy of the last recorded usage for ``session_id``, or ``None``."""
        usage = self._last_usage_by_session.get(session_id)
        return dict(usage) if usage else None

    def _get_web_search_config(self) -> Any:
        """Build a ``WebSearchConfig`` from the stored web-search settings."""
        from nanobot.config.schema import WebSearchConfig

        ws_dict = get_web_search_config()
        return WebSearchConfig(
            provider=ws_dict.get("provider", "duckduckgo"),
            api_key=ws_dict.get("api_key", ""),
            base_url=ws_dict.get("base_url", ""),
            max_results=ws_dict.get("max_results", 5)
        )

    def _need_custom_agent_for_target(self, target_config: Dict[str, Any]) -> bool:
        """Report whether ``target_config`` differs from the default agent's provider.

        Compares, in order, model, provider name, API base, API key and extra
        headers. Returns ``False`` when no default agent exists yet.
        """
        if not self.agent:
            return False

        provider = self.agent.provider
        target_model = self._normalize_config_value(target_config.get("model"))
        current_model = self._normalize_config_value(
            getattr(self.agent, "model", None) or getattr(provider, "default_model", None)
        )
        if target_model != current_model:
            return True

        target_provider = self._normalize_config_value(target_config.get("provider"))
        # The current provider name is taken from the first source that has one.
        current_provider = self._normalize_config_value(getattr(provider, "_provider_name_override", None))
        if not current_provider:
            current_provider = self._normalize_config_value(getattr(getattr(provider, "_spec", None), "name", None))
        if not current_provider and current_model and self.config:
            current_provider = self._normalize_config_value(self.config.get_provider_name(current_model))
        if target_provider != current_provider:
            return True

        target_api_base = self._normalize_config_value(target_config.get("api_base"))
        current_api_base = self._normalize_config_value(getattr(provider, "api_base", None))
        if target_api_base != current_api_base:
            return True

        target_api_key = self._normalize_config_value(target_config.get("api_key"))
        current_api_key = self._normalize_config_value(getattr(provider, "api_key", None))
        if target_api_key != current_api_key:
            return True

        target_headers = target_config.get("extra_headers") or {}
        current_headers = getattr(provider, "extra_headers", None) or {}
        return target_headers != current_headers

    def initialize(self):
        """Create the workspace, load config and build the default agent.

        The provider comes from the active LLM configuration when it names a
        model, otherwise from the loaded nanobot config.
        """
        workspace_path = get_workspace_root()
        workspace_path.mkdir(parents=True, exist_ok=True)
        self._sync_builtin_skills_to_workspace(workspace_path)

        # Override config workspace path via environment variable (since config is loaded from env)
        os.environ["NANOBOT_AGENTS__DEFAULTS__WORKSPACE"] = str(workspace_path)

        self.config = load_config()
        # No need to set self.config.workspace_path as it's a property that reads from agents.defaults.workspace

        self.bus = MessageBus()
        active_config = get_active_llm_config()
        initial_model = self.config.agents.defaults.model
        if active_config and active_config.get("model"):
            provider = self._make_provider_from_target(active_config)
            initial_model = self._normalize_config_value(active_config.get("model")) or initial_model
        else:
            provider = self._make_provider(self.config)

        cron_store_path = workspace_path / "cron"
        cron_store_path.mkdir(parents=True, exist_ok=True)
        cron_store_file = cron_store_path / "jobs.json"

        self.cron = CronService(cron_store_file)

        session_manager = SessionManager(self.config.workspace_path)

        self.agent = AgentLoop(
            bus=self.bus,
            provider=provider,
            workspace=self.config.workspace_path,
            model=initial_model,
            max_iterations=self.config.agents.defaults.max_tool_iterations,
            context_window_tokens=self.config.agents.defaults.context_window_tokens,
            web_search_config=self._get_web_search_config(),
            web_proxy=self.config.tools.web.proxy or None,
            exec_config=self.config.tools.exec,
            cron_service=self.cron,
            restrict_to_workspace=self.config.tools.restrict_to_workspace,
            session_manager=session_manager,
            mcp_servers=self.config.tools.mcp_servers,
            channels_config=self.config.channels,
            timezone=self.config.agents.defaults.timezone,
        )

        self._register_custom_tools(self.agent)

    def _sync_builtin_skills_to_workspace(self, workspace_path: Path) -> None:
        """Copy each built-in skill's ``SKILL.md`` into ``<workspace>/skills/<name>/``.

        Skills whose source file is missing are skipped; existing copies are
        overwritten.
        """
        builtin_root = Path(__file__).resolve().parents[1] / "skills_builtin"
        workspace_skills_root = workspace_path / "skills"
        workspace_skills_root.mkdir(parents=True, exist_ok=True)

        for skill_name in ("nl2sql", "visualization", "knowledge-base"):
            source_dir = builtin_root / skill_name
            source_skill_file = source_dir / "SKILL.md"
            if not source_skill_file.exists():
                continue
            target_dir = workspace_skills_root / skill_name
            target_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source_skill_file, target_dir / "SKILL.md")

    def _register_custom_tools(self, agent: AgentLoop, project_id: int | None = None):
        """Register the application's tools on ``agent``.

        ``project_id`` is passed only to the two sub-agent tools.
        """
        from app.tools.nl2sql import NL2SQLTool
        from app.tools.visualization import VisualizationTool
        from app.tools.get_schema import GetDatabaseSchemaTool
        from app.tools.knowledge_base import KnowledgeBaseRetrieveTool
        from app.tools.subagent import ListSubagentsTool, InvokeSubagentTool
        agent.tools.register(NL2SQLTool())
        agent.tools.register(VisualizationTool())
        agent.tools.register(GetDatabaseSchemaTool())
        agent.tools.register(KnowledgeBaseRetrieveTool())
        agent.tools.register(ListSubagentsTool(project_id=project_id))
        agent.tools.register(InvokeSubagentTool(project_id=project_id))

    def _build_provider(
        self,
        model: str,
        provider_name: str | None,
        api_key: str | None,
        api_base: str | None,
        extra_headers: dict[str, Any] | None = None,
    ):
        """Instantiate the provider implementation for ``provider_name``/``model``.

        The registry entry's ``backend`` selects the class, defaulting to the
        OpenAI-compatible provider. Inside Docker a loopback ``api_base`` is
        rewritten first.

        Raises:
            ValueError: For the Azure OpenAI backend when ``api_key`` or
                ``api_base`` is missing.
        """
        spec = find_by_name(provider_name) if provider_name else None
        backend = spec.backend if spec else "openai_compat"
        if _running_in_docker():
            api_base = _rewrite_localhost_api_base(api_base)

        if backend == "openai_codex" or model.startswith("openai-codex/"):
            return OpenAICodexProvider(default_model=model)

        if backend == "azure_openai":
            if not api_key or not api_base:
                raise ValueError("Azure OpenAI requires api_key and api_base.")
            return AzureOpenAIProvider(
                api_key=api_key,
                api_base=api_base,
                default_model=model,
            )

        if backend == "anthropic":
            from nanobot.providers.anthropic_provider import AnthropicProvider
            return AnthropicProvider(
                api_key=api_key,
                api_base=api_base,
                default_model=model,
                extra_headers=extra_headers,
            )

        return PatchedOpenAICompatProvider(
            api_key=api_key,
            api_base=api_base,
            default_model=model,
            extra_headers=extra_headers,
            spec=spec,
        )

    def _make_provider(self, config: Config):
        """Build a provider for the default model declared in ``config``."""
        model = config.agents.defaults.model
        provider_name = config.get_provider_name(model)
        p = config.get_provider(model)
        provider = self._build_provider(
            model=model,
            provider_name=provider_name,
            api_key=p.api_key if p else None,
            api_base=config.get_api_base(model),
            extra_headers=p.extra_headers if p else None,
        )
        provider.generation = GenerationSettings(
            temperature=config.agents.defaults.temperature,
            max_tokens=config.agents.defaults.max_tokens,
            reasoning_effort=config.agents.defaults.reasoning_effort,
        )
        return provider

    def _make_provider_from_target(self, target_config: Dict[str, Any]):
        """Build a provider from a stored LLM configuration dict.

        Falls back to the default model when the dict names none, and to the
        config's provider lookup when the dict names no provider. Generation
        settings always come from the loaded config defaults.
        """
        model = self._normalize_config_value(target_config.get("model")) or self.config.agents.defaults.model
        provider_name = self._normalize_config_value(target_config.get("provider"))
        if not provider_name and model and self.config:
            provider_name = self._normalize_config_value(self.config.get_provider_name(model))
        provider = self._build_provider(
            model=model,
            provider_name=provider_name,
            api_key=self._normalize_config_value(target_config.get("api_key")),
            api_base=self._normalize_config_value(target_config.get("api_base")),
            extra_headers=target_config.get("extra_headers"),
        )
        provider.generation = GenerationSettings(
            temperature=self.config.agents.defaults.temperature,
            max_tokens=self.config.agents.defaults.max_tokens,
            reasoning_effort=self.config.agents.defaults.reasoning_effort,
        )
        return provider

    def _spawn_background(self, coroutine: Any) -> None:
        """Schedule ``coroutine`` as a task and hold a reference until it finishes."""
        task = asyncio.create_task(coroutine)
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    async def start(self):
        """Start the agent loop and the cron service as background tasks.

        Does nothing when already started; initializes first when no agent
        exists.
        """
        if self._started:
            return
        if not self.agent:
            self.initialize()
        self._spawn_background(self.agent.run())
        self._spawn_background(self.cron.start())
        self._started = True

    async def stop(self):
        """Stop the default agent, every cached agent and the cron service."""
        if self.agent:
            self.agent.stop()
            await self.agent.close_mcp()
        # Iterate over a snapshot: another coroutine may add an agent to the
        # cache while close_mcp() is awaited.
        for agent in list(self._model_agent_cache.values()):
            agent.stop()
            await agent.close_mcp()
        self._model_agent_cache.clear()
        if self.cron:
            self.cron.stop()
        self._started = False

    def _build_agent_for_provider(self, provider: Any, mcp_servers: dict | None = None) -> AgentLoop:
        """Create an additional agent around ``provider``.

        It shares the bus, cron service and the default agent's session
        manager. ``mcp_servers`` replaces the configured servers when given.
        """
        return AgentLoop(
            bus=self.bus,
            provider=provider,
            workspace=self.config.workspace_path,
            model=provider.default_model,
            max_iterations=self.config.agents.defaults.max_tool_iterations,
            context_window_tokens=self.config.agents.defaults.context_window_tokens,
            web_search_config=self._get_web_search_config(),
            web_proxy=self.config.tools.web.proxy or None,
            exec_config=self.config.tools.exec,
            cron_service=self.cron,
            restrict_to_workspace=self.config.tools.restrict_to_workspace,
            session_manager=self.agent.sessions if self.agent else None,
            mcp_servers=mcp_servers if mcp_servers is not None else self.config.tools.mcp_servers,
            channels_config=self.config.channels,
            timezone=self.config.agents.defaults.timezone,
        )

    async def _get_or_create_model_agent(self, model_id: str | None, target_config: Dict[str, Any] | None, project_id: int | None = None) -> AgentLoop:
        """Return the cached agent for ``(model_id, project_id)``, creating it if needed.

        For a project, that project's MCP servers are merged over the
        configured ones (same-named entries are replaced). Creation happens
        under a lock so concurrent requests do not build the same agent twice.
        """
        normalized_model_id = self._normalize_model_id(model_id)
        cache_key = (normalized_model_id, project_id)
        async with self._model_agent_lock:
            cached = self._model_agent_cache.get(cache_key)
            if cached:
                return cached

            if target_config:
                provider = self._make_provider_from_target(target_config)
            else:
                provider = self._make_provider(self.config)

            mcp_servers_dict = dict(self.config.tools.mcp_servers) if self.config.tools.mcp_servers else {}
            if project_id is not None:
                from app.api.mcp import list_mcp_servers
                from nanobot.config.schema import MCPServerConfig
                servers = await list_mcp_servers(project_id=project_id)
                for s in servers:
                    cfg = MCPServerConfig(
                        type=s.get("type"),
                        command=s.get("command") or "",
                        args=s.get("args") or [],
                        env=s.get("env") or {},
                        url=s.get("url") or "",
                        headers=s.get("headers") or {}
                    )
                    mcp_servers_dict[s["name"]] = cfg

            agent = self._build_agent_for_provider(provider, mcp_servers=mcp_servers_dict)
            self._register_custom_tools(agent, project_id=project_id)
            self._model_agent_cache[cache_key] = agent
            return agent

    async def process_message(
        self,
        message: str,
        session_id: str = "api:default",
        skill_ids: List[str] | None = None,
        model_id: str | None = None,
        project_id: int | None = None,
        on_progress: Callable[[str], Awaitable[None]] | None = None,
        on_stream: Callable[[str], Awaitable[None]] | None = None,
    ):
        """Run one user message through the appropriate agent and return the reply text.

        Args:
            message: User message text.
            session_id: Session key; also used as the chat id.
            skill_ids: Accepted for interface compatibility; not read here.
            model_id: Id of the LLM configuration to use. When it matches no
                configuration, the active configuration is used.
            project_id: Project scope. When ``None`` it is looked up from the
                session alias store.
            on_progress: Progress callback forwarded to the agent.
            on_stream: Streaming callback forwarded to the agent.

        Returns:
            The reply text extracted from the agent response.

        Raises:
            Exception: Any failure is recorded on the trace span and re-raised.
        """
        span_attributes = {
            "session_id": session_id,
            "project_id": project_id,
            "model_id": model_id,
            "component": "nanobot.process_message",
        }
        with trace_service.start_span(
            "nanobot.process_message",
            attributes=span_attributes,
            input_payload={"message": message},
        ) as root_span:
            try:
                if not self.agent:
                    self.initialize()
                if not self._started:
                    await self.start()

                if project_id is None:
                    from app.core.session_alias_store import session_alias_store

                    alias_meta = session_alias_store.get_alias_meta(session_id)
                    if alias_meta and alias_meta.get("project_id") is not None:
                        project_id = alias_meta.get("project_id")
                        root_span.set_attributes({"project_id": project_id})

                agent_to_use = self.agent
                need_custom_agent = False
                target_config = None

                selected_model_id = self._normalize_model_id(model_id)
                if selected_model_id:
                    llm_configs = get_llm_configs()
                    target_config = next(
                        (item for item in llm_configs if self._normalize_model_id(item.get("id")) == selected_model_id),
                        None,
                    )

                if target_config is None:
                    active_config = get_active_llm_config()
                    if active_config and active_config.get("id"):
                        selected_model_id = self._normalize_model_id(active_config.get("id"))
                        target_config = active_config

                if target_config and self._need_custom_agent_for_target(target_config):
                    need_custom_agent = True
                # A project always gets its own agent (project-scoped tools and MCP servers).
                if project_id is not None:
                    need_custom_agent = True

                with trace_service.start_span(
                    "nanobot.resolve_agent",
                    attributes={
                        "session_id": session_id,
                        "project_id": project_id,
                        "selected_model_id": selected_model_id,
                        "custom_agent": need_custom_agent,
                    },
                ):
                    if need_custom_agent:
                        agent_to_use = await self._get_or_create_model_agent(selected_model_id, target_config, project_id)

                session = agent_to_use.sessions.get_or_create(session_id)
                normalized_messages = self._normalize_session_messages(session.messages)
                # Persist only when normalization changed the number of entries.
                if len(normalized_messages) != len(session.messages):
                    session.messages = normalized_messages
                    agent_to_use.sessions.save(session)

                with trace_service.start_span(
                    "nanobot.process_direct",
                    attributes={
                        "session_id": session_id,
                        "model": getattr(agent_to_use, "model", None),
                    },
                ) as direct_span:
                    response = await agent_to_use.process_direct(
                        message,
                        session_key=session_id,
                        channel="api",
                        chat_id=session_id,
                        on_progress=on_progress,
                        on_stream=on_stream,
                    )
                    usage = self._normalize_usage(getattr(agent_to_use, "_last_usage", None))
                    if usage:
                        self._last_usage_by_session[session_id] = usage
                        direct_span.set_attributes(build_usage_attributes(usage))
                        root_span.set_attributes(build_usage_attributes(usage))
                    text = self._extract_response_text(response)
                    direct_span.update(output={"content": text})
                    root_span.update(output={"content": text})
                    return text
            except Exception as exc:
                root_span.set_attributes(build_error_attributes(exc, stage="nanobot_process_message"))
                root_span.record_error(exc, stage="nanobot_process_message")
                raise

    def _normalize_session_messages(self, messages: List[Any]) -> List[dict[str, Any]]:
        """Flatten nested message lists into a flat list of dict messages.

        Nested lists are expanded in place (order preserved); entries that
        are neither dicts nor lists are dropped.
        """
        from collections import deque  # local import: not imported at module top

        normalized: List[dict[str, Any]] = []
        pending = deque(messages)
        while pending:
            current = pending.popleft()
            if isinstance(current, dict):
                normalized.append(current)
            elif isinstance(current, list):
                # extendleft() reverses its input, so reverse first to keep
                # the nested items in their original order at the front.
                pending.extendleft(reversed(current))
        return normalized
|
||||
|
||||
# Module-level instance created at import time. Construction only sets empty
# state; no agent is built until initialize() or start() runs.
nanobot_service = NanobotIntegration()
|
||||
@@ -0,0 +1,43 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from nanobot.providers.openai_compat_provider import OpenAICompatProvider
|
||||
|
||||
|
||||
class PatchedOpenAICompatProvider(OpenAICompatProvider):
    """OpenAI-compatible provider that sends ``max_completion_tokens`` where required.

    The parent's request kwargs are reused; ``max_tokens`` is renamed to
    ``max_completion_tokens`` when the provider spec opts in or the model
    name contains one of ``_MAX_COMPLETION_TOKEN_MODELS``.
    """

    # Substrings of (lower-cased) model names that trigger the rename.
    _MAX_COMPLETION_TOKEN_MODELS = ("gpt-5", "o1", "o3", "o4")

    def _build_kwargs(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None,
        model: str | None,
        max_tokens: int,
        temperature: float,
        reasoning_effort: str | None,
        tool_choice: str | dict[str, Any] | None,
    ) -> dict[str, Any]:
        """Build request kwargs, renaming ``max_tokens`` when the model needs it."""
        request_kwargs = super()._build_kwargs(
            messages=messages,
            tools=tools,
            model=model,
            max_tokens=max_tokens,
            temperature=temperature,
            reasoning_effort=reasoning_effort,
            tool_choice=tool_choice,
        )

        lowered_name = (model or self.default_model or "").lower()
        spec = self._spec
        spec_opts_in = bool(spec) and bool(getattr(spec, "supports_max_completion_tokens", False))
        name_opts_in = any(marker in lowered_name for marker in self._MAX_COMPLETION_TOKEN_MODELS)

        if (spec_opts_in or name_opts_in) and "max_tokens" in request_kwargs:
            request_kwargs["max_completion_tokens"] = request_kwargs.pop("max_tokens")

        return request_kwargs
|
||||
@@ -0,0 +1,56 @@
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional
|
||||
from jose import jwt, JWTError
|
||||
from passlib.context import CryptContext
|
||||
from fastapi import HTTPException, Depends, status
|
||||
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
||||
from pydantic import BaseModel
|
||||
|
||||
import os

# JWT signing key. Set DATACLAW_SECRET_KEY in every real deployment: the
# fallback literal is public in the source tree, so tokens signed with it can
# be forged. The fallback is kept only so existing setups keep working.
SECRET_KEY = os.getenv("DATACLAW_SECRET_KEY") or "your-super-secret-key-for-dataclaw"
ALGORITHM = "HS256"
ACCESS_TOKEN_EXPIRE_MINUTES = 30 * 24 * 60  # 30 days

# Password hashing context (PBKDF2-SHA256).
pwd_context = CryptContext(schemes=["pbkdf2_sha256"], deprecated="auto")
# Reads the bearer token from the Authorization header.
security = HTTPBearer()
|
||||
|
||||
class CurrentUser(BaseModel):
    """Identity of the authenticated caller, built from JWT claims by get_current_user()."""

    # Taken from the token's "id" claim.
    id: int
    # Taken from the token's "sub" claim.
    username: str
    # Taken from the token's "is_admin" claim; defaults to False.
    is_admin: bool = False
|
||||
|
||||
def verify_password(plain_password, hashed_password):
    """Return the result of verifying ``plain_password`` against ``hashed_password`` with ``pwd_context``."""
    return pwd_context.verify(plain_password, hashed_password)
|
||||
|
||||
def get_password_hash(password):
    """Return the hash of ``password`` produced by ``pwd_context``."""
    return pwd_context.hash(password)
|
||||
|
||||
def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
    """Encode ``data`` as a signed JWT with an ``exp`` claim.

    Args:
        data: Claims to embed; the dict is copied, not modified.
        expires_delta: Token lifetime. When omitted (or falsy) the token
            expires after 15 minutes.

    Returns:
        The encoded token produced by ``jwt.encode``.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    to_encode = data.copy()
    lifetime = expires_delta if expires_delta else timedelta(minutes=15)
    # Timezone-aware "now"; datetime.utcnow() is deprecated and returns a
    # naive value that is easy to misinterpret as local time.
    expire = datetime.now(timezone.utc) + lifetime
    to_encode.update({"exp": expire})
    encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
    return encoded_jwt
|
||||
|
||||
def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)) -> CurrentUser:
    """Resolve the request's bearer token into a ``CurrentUser``.

    Args:
        credentials: Bearer credentials injected by the ``security`` dependency.

    Returns:
        A ``CurrentUser`` built from the token's ``id``, ``sub`` and ``is_admin`` claims.

    Raises:
        HTTPException: 401 when the token cannot be decoded, lacks the ``id`` or
            ``sub`` claim, or carries claims that do not validate as a ``CurrentUser``.
    """
    unauthorized = HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid authentication credentials",
    )
    try:
        payload = jwt.decode(credentials.credentials, SECRET_KEY, algorithms=[ALGORITHM])
    except JWTError:
        raise unauthorized

    user_id = payload.get("id")
    username = payload.get("sub")
    if user_id is None or username is None:
        raise unauthorized

    is_admin = bool(payload.get("is_admin", False))
    try:
        return CurrentUser(id=user_id, username=username, is_admin=is_admin)
    except ValueError:
        # pydantic's ValidationError subclasses ValueError; a well-signed token
        # with malformed claims is an authentication failure, not a server error.
        raise unauthorized
|
||||
|
||||
def get_admin_user(current_user: CurrentUser = Depends(get_current_user)) -> CurrentUser:
    """Return the authenticated user if they are an admin.

    Raises:
        HTTPException: 403 when ``current_user.is_admin`` is false.
    """
    if current_user.is_admin:
        return current_user
    raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Admin permission required")
|
||||
@@ -0,0 +1,215 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
|
||||
class SessionAliasStore:
|
||||
def __init__(self) -> None:
|
||||
data_dir = get_data_root()
|
||||
try:
|
||||
data_dir.mkdir(parents=True, exist_ok=True)
|
||||
except PermissionError as exc:
|
||||
raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc
|
||||
self.db_path = data_dir / "nanobot_sessions.db"
|
||||
try:
|
||||
self._init_db()
|
||||
except PermissionError as exc:
|
||||
raise RuntimeError(f"DATA_ROOT 权限不足: {data_dir}") from exc
|
||||
|
||||
def _connect(self) -> sqlite3.Connection:
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.row_factory = sqlite3.Row
|
||||
return conn
|
||||
|
||||
def _init_db(self) -> None:
|
||||
with self._connect() as conn:
|
||||
conn.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS session_cache (
|
||||
session_key TEXT PRIMARY KEY,
|
||||
created_at TEXT,
|
||||
updated_at TEXT,
|
||||
alias TEXT,
|
||||
pinned INTEGER NOT NULL DEFAULT 0,
|
||||
archived INTEGER NOT NULL DEFAULT 0,
|
||||
last_seen_at TEXT NOT NULL
|
||||
)
|
||||
"""
|
||||
)
|
||||
cols = {
|
||||
str(row["name"])
|
||||
for row in conn.execute("PRAGMA table_info(session_cache)").fetchall()
|
||||
}
|
||||
if "pinned" not in cols:
|
||||
conn.execute("ALTER TABLE session_cache ADD COLUMN pinned INTEGER NOT NULL DEFAULT 0")
|
||||
if "archived" not in cols:
|
||||
conn.execute("ALTER TABLE session_cache ADD COLUMN archived INTEGER NOT NULL DEFAULT 0")
|
||||
if "project_id" not in cols:
|
||||
conn.execute("ALTER TABLE session_cache ADD COLUMN project_id INTEGER")
|
||||
|
||||
def sync_sessions(self, sessions: list[dict[str, Any]]) -> None:
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
keys: list[str] = []
|
||||
with self._connect() as conn:
|
||||
for item in sessions:
|
||||
key = str(item.get("key") or "").strip()
|
||||
if not key:
|
||||
continue
|
||||
keys.append(key)
|
||||
created_at = str(item.get("created_at") or "")
|
||||
updated_at = str(item.get("updated_at") or "")
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO session_cache (session_key, created_at, updated_at, last_seen_at)
|
||||
VALUES (?, ?, ?, ?)
|
||||
ON CONFLICT(session_key) DO UPDATE SET
|
||||
created_at = excluded.created_at,
|
||||
updated_at = excluded.updated_at,
|
||||
last_seen_at = excluded.last_seen_at
|
||||
""",
|
||||
(key, created_at, updated_at, now),
|
||||
)
|
||||
|
||||
if keys:
|
||||
placeholders = ",".join("?" for _ in keys)
|
||||
conn.execute(
|
||||
f"DELETE FROM session_cache WHERE session_key NOT IN ({placeholders})",
|
||||
keys,
|
||||
)
|
||||
else:
|
||||
conn.execute("DELETE FROM session_cache")
|
||||
|
||||
def list_cached_sessions(self, project_id: int | None = None) -> list[dict[str, Any]]:
|
||||
with self._connect() as conn:
|
||||
if project_id is not None:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT session_key, created_at, updated_at, alias, pinned, archived, project_id
|
||||
FROM session_cache
|
||||
WHERE project_id = ? OR project_id IS NULL
|
||||
ORDER BY pinned DESC, archived ASC, updated_at DESC
|
||||
""",
|
||||
(project_id,)
|
||||
).fetchall()
|
||||
else:
|
||||
rows = conn.execute(
|
||||
"""
|
||||
SELECT session_key, created_at, updated_at, alias, pinned, archived, project_id
|
||||
FROM session_cache
|
||||
ORDER BY pinned DESC, archived ASC, updated_at DESC
|
||||
"""
|
||||
).fetchall()
|
||||
return [self._row_to_session_item(row) for row in rows]
|
||||
|
||||
def sync_and_list(self, sessions: list[dict[str, Any]], project_id: int | None = None) -> list[dict[str, Any]]:
|
||||
self.sync_sessions(sessions)
|
||||
return self.list_cached_sessions(project_id)
|
||||
|
||||
def set_alias(self, session_key: str, alias: str) -> None:
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
clean_alias = alias.strip()
|
||||
with self._connect() as conn:
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO session_cache (session_key, created_at, updated_at, alias, last_seen_at)
|
||||
VALUES (?, '', '', ?, ?)
|
||||
ON CONFLICT(session_key) DO UPDATE SET
|
||||
alias = excluded.alias,
|
||||
last_seen_at = excluded.last_seen_at
|
||||
""",
|
||||
(session_key, clean_alias, now),
|
||||
)
|
||||
|
||||
def update_alias_meta(
|
||||
self,
|
||||
session_key: str,
|
||||
alias: str | None = None,
|
||||
pinned: bool | None = None,
|
||||
archived: bool | None = None,
|
||||
project_id: int | None = None,
|
||||
) -> dict[str, Any]:
|
||||
now = datetime.now(timezone.utc).isoformat()
|
||||
with self._connect() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT alias, pinned, archived, project_id FROM session_cache WHERE session_key = ?",
|
||||
(session_key,),
|
||||
).fetchone()
|
||||
current_alias = (str(row["alias"]) if row and row["alias"] else "")
|
||||
current_pinned = bool(row["pinned"]) if row else False
|
||||
current_archived = bool(row["archived"]) if row else False
|
||||
current_project_id = row["project_id"] if row and "project_id" in row.keys() else None
|
||||
next_alias = current_alias if alias is None else alias.strip()
|
||||
next_pinned = current_pinned if pinned is None else bool(pinned)
|
||||
next_archived = current_archived if archived is None else bool(archived)
|
||||
next_project_id = current_project_id if project_id is None else project_id
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT INTO session_cache (session_key, created_at, updated_at, alias, pinned, archived, project_id, last_seen_at)
|
||||
VALUES (?, '', '', ?, ?, ?, ?, ?)
|
||||
ON CONFLICT(session_key) DO UPDATE SET
|
||||
alias = excluded.alias,
|
||||
pinned = excluded.pinned,
|
||||
archived = excluded.archived,
|
||||
project_id = excluded.project_id,
|
||||
last_seen_at = excluded.last_seen_at
|
||||
""",
|
||||
(session_key, next_alias, int(next_pinned), int(next_archived), next_project_id, now),
|
||||
)
|
||||
return {"alias": next_alias or None, "pinned": next_pinned, "archived": next_archived, "project_id": next_project_id}
|
||||
|
||||
def get_alias(self, session_key: str) -> str | None:
|
||||
with self._connect() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT alias FROM session_cache WHERE session_key = ?",
|
||||
(session_key,),
|
||||
).fetchone()
|
||||
if not row:
|
||||
return None
|
||||
alias = row["alias"]
|
||||
return str(alias) if alias else None
|
||||
|
||||
def get_alias_meta(self, session_key: str) -> dict[str, Any] | None:
|
||||
with self._connect() as conn:
|
||||
row = conn.execute(
|
||||
"SELECT alias, pinned, archived, project_id FROM session_cache WHERE session_key = ?",
|
||||
(session_key,),
|
||||
).fetchone()
|
||||
if not row:
|
||||
return None
|
||||
alias = (row["alias"] or "").strip()
|
||||
return {
|
||||
"alias": alias or None,
|
||||
"pinned": bool(row["pinned"]) if "pinned" in row.keys() else False,
|
||||
"archived": bool(row["archived"]) if "archived" in row.keys() else False,
|
||||
"project_id": row["project_id"] if "project_id" in row.keys() else None,
|
||||
}
|
||||
|
||||
def delete_session(self, session_key: str) -> None:
|
||||
with self._connect() as conn:
|
||||
conn.execute("DELETE FROM session_cache WHERE session_key = ?", (session_key,))
|
||||
|
||||
def _row_to_session_item(self, row: sqlite3.Row) -> dict[str, Any]:
|
||||
alias = (row["alias"] or "").strip()
|
||||
fallback = str(row["session_key"]).replace("api:", "")
|
||||
title = alias or fallback
|
||||
pinned = bool(row["pinned"]) if "pinned" in row.keys() else False
|
||||
archived = bool(row["archived"]) if "archived" in row.keys() else False
|
||||
project_id = row["project_id"] if "project_id" in row.keys() else None
|
||||
return {
|
||||
"key": row["session_key"],
|
||||
"created_at": row["created_at"],
|
||||
"updated_at": row["updated_at"],
|
||||
"metadata": {"title": title},
|
||||
"alias": alias or None,
|
||||
"pinned": pinned,
|
||||
"archived": archived,
|
||||
"project_id": project_id,
|
||||
}
|
||||
|
||||
|
||||
# Module-level shared instance. Constructing it runs SessionAliasStore.__init__,
# which creates the data directory and the session_cache schema at import time.
session_alias_store = SessionAliasStore()
|
||||
@@ -0,0 +1,27 @@
|
||||
import os
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
from app.core.data_root import ensure_data_layout, get_data_root
|
||||
|
||||
# Ensure DATA_ROOT directory layout exists before SQLite engine init
ensure_data_layout()

# Ensure dataclaw.db is created in the global data directory
DB_PATH = get_data_root() / "dataclaw.db"
SQLALCHEMY_DATABASE_URL = f"sqlite:///{DB_PATH}"

# check_same_thread=False is forwarded to the sqlite3 driver so a connection may
# be used from a thread other than the one that created it.
engine = create_engine(
    SQLALCHEMY_DATABASE_URL, connect_args={"check_same_thread": False}
)
# Session factory bound to the engine; sessions neither autocommit nor autoflush.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class that the ORM models in this project subclass.
Base = declarative_base()
|
||||
|
||||
def get_db():
    """Yield a fresh ``SessionLocal`` session and close it once the consumer is finished.

    The session is closed in a ``finally`` clause, so it is released even when the
    consumer raises.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
@@ -0,0 +1,227 @@
|
||||
from sqlalchemy import Boolean, Column, DateTime, ForeignKey, Integer, String, Text, JSON, Enum as SQLEnum, func
|
||||
from sqlalchemy.orm import relationship
|
||||
import enum
|
||||
|
||||
from app.database import Base
|
||||
|
||||
|
||||
# Lifecycle states of an A2A task; stored via SQLEnum on A2ATask.state.
# Members are str-valued, with each value equal to its name.
class A2ATaskState(str, enum.Enum):
    SUBMITTED = "SUBMITTED"
    WORKING = "WORKING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
    CANCELED = "CANCELED"
    INPUT_REQUIRED = "INPUT_REQUIRED"
    AUTH_REQUIRED = "AUTH_REQUIRED"
    REJECTED = "REJECTED"
|
||||
|
||||
|
||||
# Kind of content carried by an A2APart row (see A2APart.part_type).
class A2APartType(str, enum.Enum):
    TEXT = "text"
    RAW = "raw"
    URL = "url"
    DATA = "data"
|
||||
|
||||
|
||||
# Author role of an A2A message (see A2AMessage.role).
class A2AMessageRole(str, enum.Enum):
    USER = "user"
    AGENT = "agent"
    SYSTEM = "system"
|
||||
|
||||
|
||||
class A2ARemoteAgent(Base):
    """Remote A2A agent registered for a project (table ``a2a_remote_agents``)."""

    __tablename__ = "a2a_remote_agents"

    id = Column(Integer, primary_key=True, index=True)
    project_id = Column(Integer, ForeignKey("projects.id"), nullable=False, index=True)
    name = Column(String, nullable=False)
    base_url = Column(String, nullable=False)
    # Authentication settings. NOTE(review): tokens, secrets and private keys are
    # declared as plain String/Text columns — confirm whether they are encrypted
    # before being written.
    auth_scheme = Column(String, nullable=False, default="none")
    auth_token = Column(String, nullable=True)
    shared_secret = Column(String, nullable=True)
    mtls_ca_cert = Column(Text, nullable=True)
    mtls_client_cert = Column(Text, nullable=True)
    mtls_client_key = Column(Text, nullable=True)
    oauth2_client_id = Column(String, nullable=True)
    oauth2_client_secret = Column(String, nullable=True)
    oauth2_token_url = Column(String, nullable=True)
    oauth2_scopes = Column(String, nullable=True)
    oidc_issuer_url = Column(String, nullable=True)
    oidc_client_id = Column(String, nullable=True)
    oidc_client_secret = Column(String, nullable=True)
    protocol_version = Column(String, nullable=True)
    # JSON documents are stored as text; capabilities default to an empty list.
    capabilities_json = Column(Text, nullable=False, default="[]")
    card_json = Column(Text, nullable=True)
    card_fetched_at = Column(DateTime, nullable=True)
    # Health bookkeeping fields; presumably drive a circuit breaker — verify against the caller.
    healthy = Column(Boolean, nullable=False, default=False)
    failure_count = Column(Integer, nullable=False, default=0)
    circuit_open_until = Column(DateTime, nullable=True)
    created_by = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    project = relationship("Project")
|
||||
|
||||
|
||||
class A2APart(Base):
    """Content part attached to a message or an artifact (table ``a2a_parts``)."""

    __tablename__ = "a2a_parts"

    id = Column(Integer, primary_key=True, index=True)
    # Both parent references are nullable; rows are removed with their parent (ON DELETE CASCADE).
    message_id = Column(Integer, ForeignKey("a2a_messages.id", ondelete="CASCADE"), nullable=True, index=True)
    artifact_id = Column(Integer, ForeignKey("a2a_artifacts.id", ondelete="CASCADE"), nullable=True, index=True)
    part_type = Column(SQLEnum(A2APartType), nullable=False)
    # One content column per part type; presumably only the one matching part_type is set — TODO confirm.
    text_content = Column(Text, nullable=True)
    raw_content = Column(Text, nullable=True)
    url_content = Column(String, nullable=True)
    data_content = Column(Text, nullable=True)
    media_type = Column(String, nullable=True)
    filename = Column(String, nullable=True)
    metadata_json = Column(Text, nullable=False, default="{}")
    created_at = Column(DateTime, default=func.now())

    message = relationship("A2AMessage", back_populates="parts", foreign_keys=[message_id])
    artifact = relationship("A2AArtifact", back_populates="parts", foreign_keys=[artifact_id])
|
||||
|
||||
|
||||
class A2AMessage(Base):
    """A2A message, optionally attached to a task (table ``a2a_messages``)."""

    __tablename__ = "a2a_messages"

    # Integer surrogate key; message_id is the unique string identifier.
    id = Column(Integer, primary_key=True, index=True)
    message_id = Column(String, nullable=False, unique=True, index=True)
    context_id = Column(String, nullable=True, index=True)
    task_id = Column(String, ForeignKey("a2a_tasks.id", ondelete="CASCADE"), nullable=True, index=True)
    role = Column(SQLEnum(A2AMessageRole), nullable=False)
    extensions_json = Column(Text, nullable=False, default="{}")
    reference_task_ids_json = Column(Text, nullable=False, default="[]")
    created_at = Column(DateTime, default=func.now(), index=True)

    task = relationship("A2ATask", back_populates="messages", foreign_keys=[task_id])
    # Parts are deleted together with the message.
    parts = relationship("A2APart", back_populates="message", cascade="all, delete-orphan")
|
||||
|
||||
|
||||
class A2AArtifact(Base):
    """Artifact belonging to a task (table ``a2a_artifacts``)."""

    __tablename__ = "a2a_artifacts"

    # Integer surrogate key; artifact_id is the unique string identifier.
    id = Column(Integer, primary_key=True, index=True)
    artifact_id = Column(String, nullable=False, unique=True, index=True)
    task_id = Column(String, ForeignKey("a2a_tasks.id", ondelete="CASCADE"), nullable=False, index=True)
    name = Column(String, nullable=True)
    description = Column(Text, nullable=True)
    metadata_json = Column(Text, nullable=False, default="{}")
    extensions_json = Column(Text, nullable=False, default="{}")
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    task = relationship("A2ATask", back_populates="artifacts")
    # Parts are deleted together with the artifact.
    parts = relationship("A2APart", back_populates="artifact", cascade="all, delete-orphan")
|
||||
|
||||
|
||||
class A2ATask(Base):
    """A2A task with its state, input/output text and owning project (table ``a2a_tasks``)."""

    __tablename__ = "a2a_tasks"

    # String primary key (unlike the integer keys of the sibling tables).
    id = Column(String, primary_key=True, index=True)
    project_id = Column(Integer, ForeignKey("projects.id"), nullable=False, index=True)
    # tenant_id references users.id.
    tenant_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
    context_id = Column(String, nullable=True, index=True)
    source = Column(String, nullable=False, default="local")
    remote_agent_id = Column(Integer, ForeignKey("a2a_remote_agents.id"), nullable=True, index=True)
    # Indexed but not unique at the schema level.
    idempotency_key = Column(String, nullable=True, index=True)
    state = Column(SQLEnum(A2ATaskState), nullable=False, index=True, default=A2ATaskState.SUBMITTED)
    input_text = Column(Text, nullable=False, default="")
    output_text = Column(Text, nullable=True)
    error_message = Column(Text, nullable=True)
    compatibility_mode = Column(Boolean, nullable=False, default=True)
    metadata_json = Column(Text, nullable=False, default="{}")
    history_length = Column(Integer, nullable=False, default=0)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())
    finished_at = Column(DateTime, nullable=True)

    project = relationship("Project")
    remote_agent = relationship("A2ARemoteAgent")
    # References A2AMessage.task_id directly, so A2AMessage must be defined above this class.
    messages = relationship("A2AMessage", back_populates="task", cascade="all, delete-orphan", foreign_keys=[A2AMessage.task_id])
    artifacts = relationship("A2AArtifact", back_populates="task", cascade="all, delete-orphan")
|
||||
|
||||
|
||||
class A2ATaskEvent(Base):
    """Event recorded for a task, with a JSON text payload (table ``a2a_task_events``)."""

    __tablename__ = "a2a_task_events"

    id = Column(Integer, primary_key=True, index=True)
    task_id = Column(String, ForeignKey("a2a_tasks.id", ondelete="CASCADE"), nullable=False, index=True)
    event_type = Column(String, nullable=False)
    payload_json = Column(Text, nullable=False, default="{}")
    created_at = Column(DateTime, default=func.now(), index=True)

    task = relationship("A2ATask")
|
||||
|
||||
|
||||
class A2ATaskWebhook(Base):
    """Webhook target registered for a task (table ``a2a_task_webhooks``)."""

    __tablename__ = "a2a_task_webhooks"

    id = Column(Integer, primary_key=True, index=True)
    task_id = Column(String, ForeignKey("a2a_tasks.id", ondelete="CASCADE"), nullable=False, index=True)
    target_url = Column(String, nullable=False)
    # NOTE(review): secret and auth_header are plain String columns — confirm how they are protected.
    secret = Column(String, nullable=True)
    auth_header = Column(String, nullable=True)
    enabled = Column(Boolean, nullable=False, default=True)
    created_by = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    task = relationship("A2ATask")
|
||||
|
||||
|
||||
class A2AWebhookDelivery(Base):
    """Delivery record of one task event to one webhook (table ``a2a_webhook_deliveries``)."""

    __tablename__ = "a2a_webhook_deliveries"

    id = Column(Integer, primary_key=True, index=True)
    task_id = Column(String, ForeignKey("a2a_tasks.id", ondelete="CASCADE"), nullable=False, index=True)
    webhook_id = Column(Integer, ForeignKey("a2a_task_webhooks.id", ondelete="CASCADE"), nullable=False, index=True)
    event_id = Column(Integer, ForeignKey("a2a_task_events.id", ondelete="CASCADE"), nullable=False, index=True)
    attempt = Column(Integer, nullable=False, default=0)
    # Free-form status string; only the "PENDING" default is visible here.
    status = Column(String, nullable=False, default="PENDING")
    response_code = Column(Integer, nullable=True)
    response_body = Column(Text, nullable=True)
    error_message = Column(Text, nullable=True)
    next_retry_at = Column(DateTime, nullable=True)
    delivered_at = Column(DateTime, nullable=True)
    dead_letter = Column(Boolean, nullable=False, default=False, index=True)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    task = relationship("A2ATask")
    webhook = relationship("A2ATaskWebhook")
    event = relationship("A2ATaskEvent")
|
||||
|
||||
|
||||
class A2AProjectConfig(Base):
    """Per-project A2A settings; one row per project (table ``a2a_project_configs``)."""

    __tablename__ = "a2a_project_configs"

    # The project id is the primary key, so each project has at most one config row.
    project_id = Column(Integer, ForeignKey("projects.id"), primary_key=True)
    canary_enabled = Column(Boolean, nullable=False, default=False)
    # No range constraint is declared at the schema level.
    canary_percent = Column(Integer, nullable=False, default=0)
    rollback_to_local = Column(Boolean, nullable=False, default=True)
    compatibility_mode = Column(Boolean, nullable=False, default=True)
    dual_event_write = Column(Boolean, nullable=False, default=True)
    route_mode_default = Column(String, nullable=False, default="local_first")
    # JSON documents stored as text.
    fallback_chain_json = Column(Text, nullable=False, default='["local"]')
    alert_thresholds_json = Column(Text, nullable=False, default="{}")
    updated_by = Column(Integer, ForeignKey("users.id"), nullable=False)
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    project = relationship("Project")
|
||||
|
||||
|
||||
class A2AAuditLog(Base):
    """Audit record of an action taken by a user (table ``a2a_audit_logs``)."""

    __tablename__ = "a2a_audit_logs"

    id = Column(Integer, primary_key=True, index=True)
    actor_user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
    action = Column(String, nullable=False)
    target_type = Column(String, nullable=False)
    target_id = Column(String, nullable=False)
    project_id = Column(Integer, ForeignKey("projects.id"), nullable=True, index=True)
    # Plain string without a foreign key to a2a_tasks.
    task_id = Column(String, nullable=True, index=True)
    result = Column(String, nullable=False)
    detail_json = Column(Text, nullable=False, default="{}")
    created_at = Column(DateTime, default=func.now(), index=True)
|
||||
@@ -0,0 +1,16 @@
|
||||
from sqlalchemy import Column, Integer, String, JSON, DateTime, ForeignKey, func
|
||||
from sqlalchemy.orm import relationship
|
||||
from app.database import Base
|
||||
|
||||
class DataSource(Base):
    """Data source configured for a project (table ``data_sources``)."""

    __tablename__ = "data_sources"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    type = Column(String, nullable=False)
    # Stored as a JSON column; its structure is not defined in this model.
    config = Column(JSON, nullable=False)
    project_id = Column(Integer, ForeignKey("projects.id"), nullable=False)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    # Counterpart of Project.data_sources.
    project = relationship("Project", back_populates="data_sources")
|
||||
@@ -0,0 +1,17 @@
|
||||
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, func
|
||||
from sqlalchemy.orm import relationship
|
||||
from app.database import Base
|
||||
|
||||
class Project(Base):
    """Project owned by a user (table ``projects``)."""

    __tablename__ = "projects"

    id = Column(Integer, primary_key=True, index=True)
    name = Column(String, nullable=False)
    description = Column(String, nullable=True)
    owner_id = Column(Integer, ForeignKey("users.id"), nullable=False)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    owner = relationship("User", back_populates="projects")
    # Data sources and subagents are deleted together with the project.
    data_sources = relationship("DataSource", back_populates="project", cascade="all, delete-orphan")
    subagents = relationship("Subagent", back_populates="project", cascade="all, delete-orphan")
|
||||
@@ -0,0 +1,17 @@
|
||||
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, func
|
||||
from sqlalchemy.orm import relationship
|
||||
from app.database import Base
|
||||
|
||||
class Subagent(Base):
    """Subagent definition belonging to a project (table ``subagents``)."""

    __tablename__ = "subagents"

    id = Column(Integer, primary_key=True, index=True)
    project_id = Column(Integer, ForeignKey("projects.id"), nullable=False)
    name = Column(String, nullable=False)
    description = Column(String, nullable=True)
    instructions = Column(String, nullable=True)
    model = Column(String, nullable=True)
    created_at = Column(DateTime, default=func.now())
    updated_at = Column(DateTime, default=func.now(), onupdate=func.now())

    # Counterpart of Project.subagents.
    project = relationship("Project", back_populates="subagents")
|
||||
@@ -0,0 +1,31 @@
|
||||
from sqlalchemy import Column, Integer, String, Boolean, DateTime, ForeignKey
|
||||
from sqlalchemy.orm import relationship
|
||||
from sqlalchemy.sql import func
|
||||
from app.database import Base
|
||||
|
||||
class User(Base):
    """Application user account (table ``users``)."""

    __tablename__ = "users"

    id = Column(Integer, primary_key=True, index=True)
    # Usernames and e-mail addresses are each unique.
    username = Column(String, unique=True, index=True, nullable=False)
    email = Column(String, unique=True, index=True, nullable=False)
    hashed_password = Column(String, nullable=False)
    avatar = Column(String, nullable=True)  # Store avatar identifier or URL
    is_active = Column(Boolean, default=True)
    is_admin = Column(Boolean, default=False)
    # Timestamp is filled in by the database server (server_default).
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    projects = relationship("Project", back_populates="owner")
    # Verification rows are deleted together with the user.
    email_verifications = relationship("EmailVerification", back_populates="user", cascade="all, delete-orphan")
|
||||
|
||||
class EmailVerification(Base):
    """E-mail verification token issued to a user (table ``email_verifications``)."""

    __tablename__ = "email_verifications"

    id = Column(Integer, primary_key=True, index=True)
    user_id = Column(Integer, ForeignKey("users.id", ondelete="CASCADE"), nullable=False)
    # Named token_hash, so presumably a hash of the token rather than the token itself — verify against the writer.
    token_hash = Column(String, index=True, nullable=False)
    expires_at = Column(DateTime(timezone=True), nullable=False)
    is_used = Column(Boolean, default=False)
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    user = relationship("User", back_populates="email_verifications")
|
||||
@@ -0,0 +1,361 @@
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from typing import Optional, List, Dict, Any, Literal, Union
|
||||
from datetime import datetime
|
||||
from enum import Enum
|
||||
|
||||
|
||||
# Task lifecycle states exposed through the API schemas.
# Members are str-valued, with each value equal to its name.
class A2ATaskState(str, Enum):
    SUBMITTED = "SUBMITTED"
    WORKING = "WORKING"
    COMPLETED = "COMPLETED"
    FAILED = "FAILED"
    CANCELED = "CANCELED"
    INPUT_REQUIRED = "INPUT_REQUIRED"
    AUTH_REQUIRED = "AUTH_REQUIRED"
    REJECTED = "REJECTED"
|
||||
|
||||
|
||||
# Discriminator for the kind of content a message/artifact part carries.
class A2APartType(str, Enum):
    TEXT = "text"
    RAW = "raw"
    URL = "url"
    DATA = "data"
|
||||
|
||||
|
||||
# Author role of a message.
class A2AMessageRole(str, Enum):
    USER = "user"
    AGENT = "agent"
    SYSTEM = "system"
|
||||
|
||||
|
||||
# Part of a message or artifact as returned by the API.
# Every content field is optional; part_type names the kind of content.
class A2APartSchema(BaseModel):
    part_type: A2APartType
    text: Optional[str] = None
    # bytes here, whereas A2APartCreateSchema.raw is a str.
    raw: Optional[bytes] = None
    url: Optional[str] = None
    data: Optional[Any] = None
    mediaType: Optional[str] = None
    filename: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# Part payload accepted when creating a message or artifact.
class A2APartCreateSchema(BaseModel):
    part_type: A2APartType
    text: Optional[str] = None
    # str here, whereas A2APartSchema.raw is bytes; presumably an encoded form — TODO confirm the encoding.
    raw: Optional[str] = None
    url: Optional[str] = None
    data: Optional[Any] = None
    mediaType: Optional[str] = None
    filename: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# Message as returned by the API, including its parts.
# NOTE(review): from_attributes is enabled but the fields are camelCase without
# aliases, while the A2AMessage ORM model uses snake_case columns — confirm how
# ORM rows are converted into this schema.
class A2AMessageSchema(BaseModel):
    messageId: str
    contextId: Optional[str] = None
    taskId: Optional[str] = None
    role: A2AMessageRole
    parts: List[A2APartSchema] = Field(default_factory=list)
    extensions: Optional[Dict[str, Any]] = Field(default_factory=dict)
    referenceTaskIds: Optional[List[str]] = Field(default_factory=list)
    createdAt: Optional[datetime] = None

    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# Message payload accepted from clients; same fields as A2AMessageSchema minus
# createdAt, with parts typed as A2APartCreateSchema.
class A2AMessageCreateSchema(BaseModel):
    messageId: str
    contextId: Optional[str] = None
    taskId: Optional[str] = None
    role: A2AMessageRole
    parts: List[A2APartCreateSchema] = Field(default_factory=list)
    extensions: Optional[Dict[str, Any]] = Field(default_factory=dict)
    referenceTaskIds: Optional[List[str]] = Field(default_factory=list)
|
||||
|
||||
|
||||
# Artifact as returned by the API, including its parts.
# NOTE(review): from_attributes is enabled but field names are camelCase without
# aliases — confirm how ORM rows are converted into this schema.
class A2AArtifactSchema(BaseModel):
    artifactId: str
    name: Optional[str] = None
    description: Optional[str] = None
    parts: List[A2APartSchema] = Field(default_factory=list)
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    extensions: Optional[Dict[str, Any]] = Field(default_factory=dict)
    createdAt: Optional[datetime] = None
    updatedAt: Optional[datetime] = None

    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# Artifact payload accepted on creation; parts are typed as A2APartCreateSchema.
class A2AArtifactCreateSchema(BaseModel):
    artifactId: str
    name: Optional[str] = None
    description: Optional[str] = None
    parts: List[A2APartCreateSchema] = Field(default_factory=list)
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    extensions: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# A task state paired with a timestamp; used by TaskStatusUpdateEvent.status.
class A2ATaskStatusSchema(BaseModel):
    state: A2ATaskState
    timestamp: datetime
|
||||
|
||||
|
||||
# Task as returned by the API.
# NOTE(review): from_attributes is enabled but field names are camelCase without
# aliases, and the A2ATask ORM model stores metadata as the metadata_json text
# column — confirm how ORM rows are converted into this schema.
class A2ATaskSchema(BaseModel):
    id: str
    contextId: Optional[str] = None
    projectId: int
    tenantId: int
    source: str
    remoteAgentId: Optional[int] = None
    idempotencyKey: Optional[str] = None
    state: A2ATaskState
    inputText: str
    outputText: Optional[str] = None
    errorMessage: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    historyLength: int = 0
    createdAt: datetime
    updatedAt: datetime
    finishedAt: Optional[datetime] = None

    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# A2ATaskSchema extended with the task's messages and artifacts.
class A2ATaskWithMessagesSchema(A2ATaskSchema):
    messages: List[A2AMessageSchema] = Field(default_factory=list)
    artifacts: List[A2AArtifactSchema] = Field(default_factory=list)
|
||||
|
||||
|
||||
# Reduced task view carrying its message history and artifacts. It is a
# standalone model (not a subclass of A2ATaskSchema) and omits the
# source/input/output/error fields.
class A2ATaskWithHistorySchema(BaseModel):
    id: str
    contextId: Optional[str] = None
    projectId: int
    tenantId: int
    state: A2ATaskState
    history: List[A2AMessageSchema] = Field(default_factory=list)
    artifacts: List[A2AArtifactSchema] = Field(default_factory=list)
    createdAt: datetime
    updatedAt: datetime
    finishedAt: Optional[datetime] = None

    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
# Event payload announcing a task's status (state plus timestamp).
class TaskStatusUpdateEvent(BaseModel):
    taskId: str
    contextId: Optional[str] = None
    status: A2ATaskStatusSchema
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
||||
|
||||
|
||||
# Event payload carrying an artifact for a task.
class TaskArtifactUpdateEvent(BaseModel):
    taskId: str
    contextId: Optional[str] = None
    artifact: A2AArtifactSchema
    # Presumably chunked-delivery flags (append to an existing artifact / final chunk) — TODO confirm with the producer.
    append: bool = False
    lastChunk: bool = True
|
||||
|
||||
|
||||
# Event payload wrapping a single message.
class TaskMessageEvent(BaseModel):
    message: A2AMessageSchema
|
||||
|
||||
|
||||
# Compact task representation embedded in StreamResponse.task.
class StreamResponseTask(BaseModel):
    id: str
    contextId: Optional[str] = None
    state: A2ATaskState
    artifacts: List[A2AArtifactSchema] = Field(default_factory=list)
|
||||
|
||||
|
||||
# Envelope for one streamed item. All four fields are optional; presumably
# exactly one is set per item — the model itself does not enforce that.
class StreamResponse(BaseModel):
    task: Optional[StreamResponseTask] = None
    message: Optional[A2AMessageSchema] = None
    statusUpdate: Optional[TaskStatusUpdateEvent] = None
    artifactUpdate: Optional[TaskArtifactUpdateEvent] = None
|
||||
|
||||
|
||||
# Request body for sending a message, optionally targeting an existing task/context.
class SendMessageRequest(BaseModel):
    message: A2AMessageCreateSchema
    taskId: Optional[str] = None
    contextId: Optional[str] = None
|
||||
|
||||
|
||||
# Request body for the streaming send; declares the same fields as SendMessageRequest.
class SendStreamingMessageRequest(BaseModel):
    message: A2AMessageCreateSchema
    taskId: Optional[str] = None
    contextId: Optional[str] = None
|
||||
|
||||
|
||||
# Parameters for fetching a task; historyLength is optional.
class GetTaskRequest(BaseModel):
    historyLength: Optional[int] = None
|
||||
|
||||
|
||||
# Filter and pagination parameters for listing tasks.
class TaskListRequest(BaseModel):
    contextId: Optional[str] = None
    status: Optional[A2ATaskState] = None
    # Defaults to 20; no upper bound is declared on this model.
    pageSize: int = 20
    pageToken: Optional[str] = None
|
||||
|
||||
|
||||
# Cancel request body; declares no fields.
class CancelTaskRequest(BaseModel):
    pass
|
||||
|
||||
|
||||
# Payload for registering a push-notification (webhook) target.
class PushNotificationConfigCreate(BaseModel):
    targetUrl: str
    secret: Optional[str] = None
    authHeader: Optional[str] = None
    enabled: bool = True
|
||||
|
||||
|
||||
class PushNotificationConfig(BaseModel):
    # Full push-notification config record, including identifiers and
    # creation info.
    id: int
    taskId: str
    targetUrl: str
    secret: Optional[str] = None
    authHeader: Optional[str] = None
    enabled: bool
    createdBy: int
    createdAt: datetime

    # Lets the model be validated from objects by reading their attributes.
    model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
|
||||
class VersionNotSupportedError(BaseModel):
    # Error object with a fixed default code (-32009) and message; `data`
    # is an optional free-form mapping.
    code: int = -32009
    message: str = "Version not supported"
    data: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class AgentSkillInputMode(str, Enum):
    # String-valued enum of the input modes a skill may declare.
    TEXT = "text"
    DATA = "data"
    RAW = "raw"
    URL = "url"
|
||||
|
||||
|
||||
class AgentSkillOutputMode(str, Enum):
    # String-valued enum of the output modes a skill may declare.
    TEXT = "text"
    DATA = "data"
    ARTIFACT = "artifact"
    STREAM = "stream"
|
||||
|
||||
|
||||
class AgentSkillSecurityRequirement(BaseModel):
    # Names a security scheme and, optionally, a list of scopes.
    scheme: str
    scopes: Optional[List[str]] = None
|
||||
|
||||
|
||||
class AgentSkillExample(BaseModel):
    # One example pair: free-form input mapping and output mapping.
    input: Dict[str, Any]
    output: Dict[str, Any]
|
||||
|
||||
|
||||
class AgentSkill(BaseModel):
    # Skill descriptor. Only `id` and `name` are required; every list field
    # defaults to a new empty list.
    id: str
    name: str
    description: Optional[str] = None
    tags: List[str] = Field(default_factory=list)
    examples: List[AgentSkillExample] = Field(default_factory=list)
    inputModes: List[AgentSkillInputMode] = Field(default_factory=list)
    outputModes: List[AgentSkillOutputMode] = Field(default_factory=list)
    securityRequirements: List[AgentSkillSecurityRequirement] = Field(default_factory=list)
|
||||
|
||||
|
||||
class AgentProvider(BaseModel):
    # Provider info: organization name and an optional URL.
    organization: str
    url: Optional[str] = None
|
||||
|
||||
|
||||
class AgentSupportedInterface(BaseModel):
    # One supported interface: URL, protocol binding and version, and an
    # optional tenant.
    url: str
    protocolBinding: str
    protocolVersion: str
    tenant: Optional[str] = None
|
||||
|
||||
|
||||
class SecuritySchemeApiKey(BaseModel):
    # API-key security scheme; `type` is pinned to "apiKey".
    type: Literal["apiKey"] = "apiKey"
    name: str
    # `in` is a Python keyword, so the attribute is `in_` with alias "in".
    in_: str = Field(alias="in")
    description: Optional[str] = None

    # Accept both the alias ("in") and the attribute name ("in_") on input.
    model_config = ConfigDict(populate_by_name=True)
|
||||
|
||||
|
||||
class SecuritySchemeHttpAuth(BaseModel):
    # HTTP security scheme; `type` is pinned to "http".
    type: Literal["http"] = "http"
    scheme: str
    description: Optional[str] = None
|
||||
|
||||
|
||||
class OAuth2AuthorizationCodeFlow(BaseModel):
    # Authorization-code flow settings: authorization and token URLs are
    # required; scopes default to an empty mapping.
    authorizationUrl: str
    tokenUrl: str
    scopes: Dict[str, str] = Field(default_factory=dict)
    refreshUrl: Optional[str] = None
|
||||
|
||||
|
||||
class OAuth2ClientCredentialsFlow(BaseModel):
    # Client-credentials flow settings: token URL is required; scopes
    # default to an empty mapping.
    tokenUrl: str
    scopes: Dict[str, str] = Field(default_factory=dict)
    refreshUrl: Optional[str] = None
|
||||
|
||||
|
||||
class OAuth2DeviceCodeFlow(BaseModel):
    # Device-code flow settings; the device authorization URL is optional.
    authorizationUrl: str
    tokenUrl: str
    scopes: Dict[str, str] = Field(default_factory=dict)
    deviceAuthorizationUrl: Optional[str] = None
|
||||
|
||||
|
||||
class OAuth2Flows(BaseModel):
    # Container of optional flow definitions. Three flows have typed
    # models; `implicit` and `password` are untyped mappings.
    authorizationCode: Optional[OAuth2AuthorizationCodeFlow] = None
    clientCredentials: Optional[OAuth2ClientCredentialsFlow] = None
    deviceCode: Optional[OAuth2DeviceCodeFlow] = None
    implicit: Optional[Dict[str, Any]] = None
    password: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class SecuritySchemeOAuth2(BaseModel):
    # OAuth2 security scheme; `type` is pinned to "oauth2".
    type: Literal["oauth2"] = "oauth2"
    flows: OAuth2Flows
    description: Optional[str] = None
|
||||
|
||||
|
||||
class SecuritySchemeOpenIdConnect(BaseModel):
    # OpenID Connect security scheme; `type` is pinned to "openIdConnect".
    type: Literal["openIdConnect"] = "openIdConnect"
    openIdConnectUrl: str
    description: Optional[str] = None
    scopes: Dict[str, str] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class SecuritySchemeMtls(BaseModel):
    # Mutual-TLS security scheme; `type` is pinned to "mutualTLS" and all
    # certificate/key fields are optional.
    type: Literal["mutualTLS"] = "mutualTLS"
    description: Optional[str] = None
    caCerts: Optional[List[str]] = None
    clientCert: Optional[str] = None
    clientKey: Optional[str] = None
|
||||
|
||||
|
||||
class AgentCardPublicSchema(BaseModel):
    # Public agent card. `name`, `capabilities`, `endpoints` and `auth` are
    # required; the protocol version defaults to "1.0" and the remaining
    # collections default to empty.
    name: str
    protocol_version: str = "1.0"
    capabilities: List[str]
    endpoints: Dict[str, str]
    auth: List[str]
    skills: List[AgentSkill] = Field(default_factory=list)
    provider: Optional[AgentProvider] = None
    supportedInterfaces: List[AgentSupportedInterface] = Field(default_factory=list)
    defaultInputModes: List[str] = Field(default_factory=list)
    defaultOutputModes: List[str] = Field(default_factory=list)
    iconUrl: Optional[str] = None
    documentationUrl: Optional[str] = None
|
||||
|
||||
|
||||
class AgentCardExtendedSchema(AgentCardPublicSchema):
    # Public card plus security details, signatures, and tenant/admin info.
    # Scheme name -> one of the five security scheme models.
    securitySchemes: Optional[
        Dict[
            str,
            Union[
                SecuritySchemeApiKey,
                SecuritySchemeHttpAuth,
                SecuritySchemeOAuth2,
                SecuritySchemeOpenIdConnect,
                SecuritySchemeMtls,
            ],
        ]
    ] = None
    security: List[Dict[str, List[str]]] = Field(default_factory=list)
    signatures: List[str] = Field(default_factory=list)
    tenantId: Optional[int] = None
    isAdmin: Optional[bool] = None
|
||||
@@ -0,0 +1,128 @@
|
||||
from typing import Any, Dict, List, Optional, Literal, Union
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Base Chart Schema
|
||||
class ChartSchema(BaseModel):
    # Base chart model: a title, a mark, and an encoding. The nested
    # ChartEncoding is reused by the concrete chart schemas in this module.

    class ChartType(BaseModel):
        # Mark type restricted to the four supported literals.
        type: Literal["bar", "line", "area", "arc"]

    class ChartEncoding(BaseModel):
        # One encoding channel: source field, measurement type, axis title.
        field: str
        type: Literal["ordinal", "quantitative", "nominal"]
        title: str

    title: str
    mark: ChartType
    encoding: ChartEncoding
|
||||
|
||||
class TemporalChartEncoding(ChartSchema.ChartEncoding):
    # Encoding channel whose type is fixed to "temporal"; the time unit
    # defaults to "yearmonth".
    type: Literal["temporal"] = Field(default="temporal")
    timeUnit: str = Field(default="yearmonth")
|
||||
|
||||
# Line Chart
|
||||
class LineChartSchema(ChartSchema):
    # Line chart: x may be temporal or plain, y is required, color optional.

    class LineChartMark(BaseModel):
        type: Literal["line"] = Field(default="line")

    class LineChartEncoding(BaseModel):
        x: Union[TemporalChartEncoding, ChartSchema.ChartEncoding]
        y: ChartSchema.ChartEncoding
        color: Optional[ChartSchema.ChartEncoding] = None

    mark: LineChartMark
    encoding: LineChartEncoding
|
||||
|
||||
# Multi Line Chart
|
||||
class MultiLineChartSchema(ChartSchema):
    # Multi-line chart: adds a list of fold transforms and requires a
    # color channel.

    class MultiLineChartMark(BaseModel):
        type: Literal["line"] = Field(default="line")

    class MultiLineChartTransform(BaseModel):
        fold: List[str]
        # `as` is a Python keyword, so the attribute is `as_` with alias "as".
        as_: List[str] = Field(alias="as")

    class MultiLineChartEncoding(BaseModel):
        x: Union[TemporalChartEncoding, ChartSchema.ChartEncoding]
        y: ChartSchema.ChartEncoding
        color: ChartSchema.ChartEncoding

    mark: MultiLineChartMark
    transform: List[MultiLineChartTransform]
    encoding: MultiLineChartEncoding
|
||||
|
||||
# Bar Chart
|
||||
class BarChartSchema(ChartSchema):
    # Bar chart: x may be temporal or plain, y is required, color optional.

    class BarChartMark(BaseModel):
        type: Literal["bar"] = Field(default="bar")

    class BarChartEncoding(BaseModel):
        x: Union[TemporalChartEncoding, ChartSchema.ChartEncoding]
        y: ChartSchema.ChartEncoding
        color: Optional[ChartSchema.ChartEncoding] = None

    mark: BarChartMark
    encoding: BarChartEncoding
|
||||
|
||||
# Grouped Bar Chart
|
||||
class GroupedBarChartSchema(ChartSchema):
    # Grouped bar chart: requires both an xOffset channel and a color channel.

    class GroupedBarChartMark(BaseModel):
        type: Literal["bar"] = Field(default="bar")

    class GroupedBarChartEncoding(BaseModel):
        x: Union[TemporalChartEncoding, ChartSchema.ChartEncoding]
        y: ChartSchema.ChartEncoding
        xOffset: ChartSchema.ChartEncoding
        color: ChartSchema.ChartEncoding

    mark: GroupedBarChartMark
    encoding: GroupedBarChartEncoding
|
||||
|
||||
# Stacked Bar Chart
|
||||
class StackedBarChartYEncoding(ChartSchema.ChartEncoding):
    # Y channel with `stack` fixed to "zero".
    stack: Literal["zero"] = Field(default="zero")
|
||||
|
||||
class StackedBarChartSchema(ChartSchema):
    # Stacked bar chart: y uses the stacking encoding and color is required.

    class StackedBarChartMark(BaseModel):
        type: Literal["bar"] = Field(default="bar")

    class StackedBarChartEncoding(BaseModel):
        x: Union[TemporalChartEncoding, ChartSchema.ChartEncoding]
        y: StackedBarChartYEncoding
        color: ChartSchema.ChartEncoding

    mark: StackedBarChartMark
    encoding: StackedBarChartEncoding
|
||||
|
||||
# Pie Chart
|
||||
class PieChartSchema(ChartSchema):
    # Pie chart: an "arc" mark encoded with theta and color channels.

    class PieChartMark(BaseModel):
        type: Literal["arc"] = Field(default="arc")

    class PieChartEncoding(BaseModel):
        theta: ChartSchema.ChartEncoding
        color: ChartSchema.ChartEncoding

    mark: PieChartMark
    encoding: PieChartEncoding
|
||||
|
||||
# Area Chart
|
||||
class AreaChartSchema(ChartSchema):
    # Area chart: x may be temporal or plain; y is required.

    class AreaChartMark(BaseModel):
        type: Literal["area"] = Field(default="area")

    class AreaChartEncoding(BaseModel):
        x: Union[TemporalChartEncoding, ChartSchema.ChartEncoding]
        y: ChartSchema.ChartEncoding

    mark: AreaChartMark
    encoding: AreaChartEncoding
|
||||
|
||||
# Response Model
|
||||
class ChartGenerationResponse(BaseModel):
    # Result of chart generation: the reasoning text, the selected chart
    # type ("" when none), the optional spec, and a visualizable flag.
    reasoning: str = Field(
        ...,
        description="Reasoning for the chart choice or why a chart cannot be generated",
    )
    chart_type: Literal[
        "line", "multi_line", "bar", "pie", "grouped_bar", "stacked_bar", "area", ""
    ] = Field(..., description="The type of chart generated, or empty string if none")
    # A plain Dict[str, Any] lets the LLM emit a valid Vega-Lite spec
    # directly, avoiding strict-model serialization issues with dynamic
    # fields.
    chart_spec: Optional[Dict[str, Any]] = Field(
        None, description="The generated Vega-Lite chart specification"
    )
    can_visualize: bool = Field(..., description="Whether the data can be visualized")
|
||||
@@ -0,0 +1,29 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import Dict, Any, Optional
|
||||
from datetime import datetime
|
||||
|
||||
class DataSourceBase(BaseModel):
    # Fields shared by data source models.
    name: str
    # Free-form string; known values: sqlite, postgres, clickhouse,
    # supabase, parquet.
    type: str
    config: Dict[str, Any]
    project_id: int
|
||||
|
||||
class DataSourceCreate(DataSourceBase):
    # Creation payload; adds nothing to DataSourceBase.
    pass
|
||||
|
||||
class DataSourceUpdate(BaseModel):
    # Partial update: every field is optional and defaults to None.
    name: Optional[str] = None
    type: Optional[str] = None
    config: Optional[Dict[str, Any]] = None
|
||||
|
||||
class DataSource(DataSourceBase):
    # Stored data source: base fields plus id and timestamps.
    id: int
    created_at: datetime
    updated_at: datetime

    # Pydantic v2 configuration (the nested `class Config` form is
    # deprecated): allow validation from objects by reading attributes.
    model_config = {"from_attributes": True}
|
||||
|
||||
class DataSourceTestRequest(BaseModel):
    # Payload carrying only a type and a config mapping.
    type: str
    config: Dict[str, Any]
|
||||
@@ -0,0 +1,28 @@
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class EmbeddingModelConfigBase(BaseModel):
    # Fields shared by embedding model configurations; the provider
    # defaults to "openai".
    name: str = Field(..., description="Display name for the model configuration")
    provider: str = Field("openai", description="Provider type (e.g. openai)")
    model: str = Field(..., description="Model name (e.g. text-embedding-3-small)")
    api_base: Optional[str] = None
    api_key: Optional[str] = None
|
||||
|
||||
class EmbeddingModelConfigCreate(EmbeddingModelConfigBase):
    # Creation payload; adds nothing to the base model.
    pass
|
||||
|
||||
class EmbeddingModelConfigUpdate(BaseModel):
    # Partial update: every field is optional and defaults to None.
    name: Optional[str] = None
    provider: Optional[str] = None
    model: Optional[str] = None
    api_base: Optional[str] = None
    api_key: Optional[str] = None
|
||||
|
||||
class EmbeddingModelConfig(EmbeddingModelConfigBase):
    # Base configuration plus a string identifier.
    id: str
|
||||
|
||||
class EmbeddingModelConnectionTestRequest(BaseModel):
    # Connection-test payload: model is required, provider defaults to
    # "openai", API base and key are optional.
    provider: str = Field("openai")
    model: str = Field(...)
    api_base: Optional[str] = None
    api_key: Optional[str] = None
|
||||
@@ -0,0 +1,162 @@
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator
|
||||
|
||||
|
||||
class KnowledgeDocumentBase(BaseModel):
    # Shared document fields: title is 1-200 characters, content must be
    # non-empty, metadata defaults to an empty mapping.
    title: str = Field(..., min_length=1, max_length=200)
    content: str = Field(..., min_length=1)
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class KnowledgeDocumentCreate(KnowledgeDocumentBase):
    # Creation payload; adds nothing to the base model.
    pass
|
||||
|
||||
|
||||
class KnowledgeDocumentUpdate(BaseModel):
    # Partial update: fields are optional, but the same length limits as
    # the base model apply when a value is given.
    title: Optional[str] = Field(None, min_length=1, max_length=200)
    content: Optional[str] = Field(None, min_length=1)
    metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class KnowledgeDocument(KnowledgeDocumentBase):
    # Stored document: base fields plus string id and timestamps.
    id: str
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
|
||||
class KnowledgeBaseConfigBase(BaseModel):
    # Shared knowledge-base settings with bounded numeric options.
    name: str = Field(..., min_length=1, max_length=120)
    description: Optional[str] = None
    project_id: Optional[int] = None
    embedding_model: Optional[str] = None
    # NOTE(review): the bounds allow chunk_overlap > chunk_size (e.g. 512
    # vs 64); confirm the consumer of these values tolerates that.
    chunk_size: int = Field(default=512, ge=64, le=4096)
    chunk_overlap: int = Field(default=50, ge=0, le=512)
    top_k: int = Field(default=3, ge=1, le=20)
    is_active: bool = True
|
||||
|
||||
|
||||
class KnowledgeBaseCreate(KnowledgeBaseConfigBase):
    # Creation payload; adds nothing to the base model.
    pass
|
||||
|
||||
|
||||
class KnowledgeBaseUpdate(BaseModel):
    # Partial update: every field is optional; numeric and length bounds
    # match KnowledgeBaseConfigBase when a value is supplied.
    name: Optional[str] = Field(None, min_length=1, max_length=120)
    description: Optional[str] = None
    project_id: Optional[int] = None
    embedding_model: Optional[str] = None
    chunk_size: Optional[int] = Field(None, ge=64, le=4096)
    chunk_overlap: Optional[int] = Field(None, ge=0, le=512)
    top_k: Optional[int] = Field(None, ge=1, le=20)
    is_active: Optional[bool] = None
|
||||
|
||||
|
||||
class KnowledgeBase(KnowledgeBaseConfigBase):
    # Stored knowledge base: settings plus id, timestamps, and its
    # documents (empty list by default).
    id: str
    created_at: datetime
    updated_at: datetime
    documents: List[KnowledgeDocument] = Field(default_factory=list)
|
||||
|
||||
|
||||
class KnowledgeSearchRequest(BaseModel):
    # Search payload: non-empty query and an optional top_k within 1..20.
    query: str = Field(..., min_length=1)
    top_k: Optional[int] = Field(default=None, ge=1, le=20)
|
||||
|
||||
|
||||
class KnowledgeSearchHit(BaseModel):
    # One search hit: document id and title, the matched chunk text, its
    # score, and metadata (empty mapping by default).
    doc_id: str
    title: str
    chunk: str
    score: float
    metadata: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
|
||||
class KnowledgeSearchResponse(BaseModel):
    # Search result: an answer string and the list of hits.
    answer: str
    hits: List[KnowledgeSearchHit] = Field(default_factory=list)
|
||||
|
||||
|
||||
class KnowledgeGlobalConfigUpdate(BaseModel):
    # Update payload for the global knowledge configuration. Each validator
    # trims its value and maps missing/blank input to None.
    api_base: Optional[str] = None
    api_key: Optional[str] = None
    default_embedding_model: Optional[str] = None

    @field_validator("api_base")
    @classmethod
    def validate_api_base(cls, value: Optional[str]) -> Optional[str]:
        # Blank -> None; otherwise require an http(s) prefix and drop any
        # trailing slashes.
        cleaned = "" if value is None else value.strip()
        if not cleaned:
            return None
        if not cleaned.startswith(("http://", "https://")):
            raise ValueError("api_base must start with http:// or https://")
        return cleaned.rstrip("/")

    @field_validator("api_key")
    @classmethod
    def validate_api_key(cls, value: Optional[str]) -> Optional[str]:
        # Blank -> None; reject keys longer than 512 characters.
        cleaned = "" if value is None else value.strip()
        if not cleaned:
            return None
        if len(cleaned) > 512:
            raise ValueError("api_key is too long")
        return cleaned

    @field_validator("default_embedding_model")
    @classmethod
    def validate_default_embedding_model(cls, value: Optional[str]) -> Optional[str]:
        # Blank -> None; reject names longer than 200 characters.
        cleaned = "" if value is None else value.strip()
        if not cleaned:
            return None
        if len(cleaned) > 200:
            raise ValueError("default_embedding_model is too long")
        return cleaned
|
||||
|
||||
|
||||
class KnowledgeGlobalConfig(BaseModel):
    # Global knowledge configuration. Carries both the raw key field and a
    # masked variant, plus a flag saying whether a key is set.
    api_base: Optional[str] = None
    api_key: Optional[str] = None
    api_key_masked: Optional[str] = None
    has_api_key: bool = False
    default_embedding_model: Optional[str] = None
|
||||
|
||||
|
||||
class KnowledgeConnectionTestRequest(BaseModel):
    # Connection-test payload; all fields are optional and are normalized
    # (trimmed, blank -> None) by the validators below.
    api_base: Optional[str] = None
    api_key: Optional[str] = None
    model_name: Optional[str] = None

    @field_validator("api_base")
    @classmethod
    def validate_test_api_base(cls, value: Optional[str]) -> Optional[str]:
        # Blank -> None; otherwise require an http(s) prefix and drop any
        # trailing slashes.
        cleaned = "" if value is None else value.strip()
        if not cleaned:
            return None
        if not cleaned.startswith(("http://", "https://")):
            raise ValueError("api_base must start with http:// or https://")
        return cleaned.rstrip("/")

    @field_validator("api_key", "model_name")
    @classmethod
    def normalize_test_value(cls, value: Optional[str]) -> Optional[str]:
        # Trim surrounding whitespace; an empty result becomes None.
        if value is None:
            return None
        return value.strip() or None
|
||||
|
||||
|
||||
class KnowledgeConnectionTestResponse(BaseModel):
    # Connection-test outcome: success flag and message are required; the
    # remaining details are optional.
    success: bool
    message: str
    model_name: Optional[str] = None
    embedding_dimension: Optional[int] = None
    resolved_api_base: Optional[str] = None
    available_models: List[str] = Field(default_factory=list)
|
||||
@@ -0,0 +1,30 @@
|
||||
from typing import List, Dict, Optional, Literal
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class MCPServerBase(BaseModel):
    # Shared MCP server fields. `type` is one of three transport literals;
    # `status` defaults to "disconnected".
    name: str
    type: Literal["stdio", "sse", "streamableHttp"]
    command: Optional[str] = None
    args: Optional[List[str]] = Field(default_factory=list)
    env: Optional[Dict[str, str]] = Field(default_factory=dict)
    url: Optional[str] = None
    headers: Optional[Dict[str, str]] = Field(default_factory=dict)
    project_id: int
    status: str = "disconnected"
|
||||
|
||||
class MCPServerCreate(MCPServerBase):
    # Creation payload; adds nothing to the base model.
    pass
|
||||
|
||||
class MCPServerUpdate(BaseModel):
    # Partial update: every field is optional and defaults to None.
    name: Optional[str] = None
    type: Optional[Literal["stdio", "sse", "streamableHttp"]] = None
    command: Optional[str] = None
    args: Optional[List[str]] = None
    env: Optional[Dict[str, str]] = None
    url: Optional[str] = None
    headers: Optional[Dict[str, str]] = None
    project_id: Optional[int] = None
    status: Optional[str] = None
|
||||
|
||||
class MCPServer(MCPServerBase):
    # Base fields plus a string identifier.
    id: str
|
||||
@@ -0,0 +1,115 @@
|
||||
from typing import List, Optional, Dict, Any, Union, Literal
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Common Types
|
||||
# Comparison operators accepted by column-level access control.
AccessControlOperator = Literal[
    "EQUALS",
    "NOT_EQUALS",
    "GREATER_THAN",
    "LESS_THAN",
    "GREATER_THAN_OR_EQUALS",
    "LESS_THAN_OR_EQUALS",
]

# Cardinality options for a relationship between two models.
JoinType = Literal[
    "ONE_TO_ONE",
    "ONE_TO_MANY",
    "MANY_TO_ONE",
    "MANY_TO_MANY",
]
|
||||
|
||||
# Column Definitions
|
||||
class SessionProperty(BaseModel):
    # Named property with a required flag and an optional default expression.
    name: str
    required: bool
    defaultExpr: Optional[str] = None
|
||||
|
||||
class AccessControlThreshold(BaseModel):
    # Threshold value kept as a string, tagged as NUMERIC or STRING.
    value: str
    dataType: Literal["NUMERIC", "STRING"]
|
||||
|
||||
class ColumnAccessControl(BaseModel):
    # Column-level access rule: operator, the properties it requires, and
    # an optional threshold.
    name: str
    operator: AccessControlOperator
    requiredProperties: List[SessionProperty]
    threshold: Optional[AccessControlThreshold] = None
|
||||
|
||||
class Column(BaseModel):
    # Column definition. Only `name` and `type` are required; the boolean
    # flags default to False and `properties` to an empty mapping.
    name: str
    type: str
    relationship: Optional[str] = None
    isCalculated: bool = False
    notNull: bool = False
    expression: Optional[str] = None
    isHidden: bool = False
    columnLevelAccessControl: Optional[ColumnAccessControl] = None
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Model Definitions
|
||||
class TableReference(BaseModel):
    # Table locator: optional catalog and schema, required table name.
    catalog: Optional[str] = None
    # Stored as `schema_`; the external key is "schema" via the alias.
    schema_: Optional[str] = Field(None, alias="schema")
    table: str
|
||||
|
||||
class RowLevelAccessControl(BaseModel):
    # Row-level access rule: required properties plus a condition string.
    name: str
    requiredProperties: List[SessionProperty]
    condition: str
|
||||
|
||||
class Model(BaseModel):
    # Model definition. Only `name` is required; the source can be given
    # as a table reference, a SQL string, or a base object name.
    name: str
    tableReference: Optional[TableReference] = None
    refSql: Optional[str] = None
    baseObject: Optional[str] = None
    columns: List[Column] = Field(default_factory=list)
    primaryKey: Optional[str] = None
    cached: bool = False
    refreshTime: Optional[str] = None
    rowLevelAccessControls: List[RowLevelAccessControl] = Field(default_factory=list)
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Relationship Definitions
|
||||
class Relationship(BaseModel):
    # Relationship between models, with a join type and a condition string.
    name: str
    # Intended to hold exactly two entries (minItems: 2, maxItems: 2);
    # the length is not enforced by this model.
    models: List[str]
    joinType: JoinType
    condition: str
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Metric Definitions
|
||||
class MetricTimeGrain(BaseModel):
    # Time grain: a name, the referenced column, and its date parts.
    name: str
    refColumn: str
    dateParts: List[str]
|
||||
|
||||
class Metric(BaseModel):
    # Metric definition over a base object; dimension, measure and
    # time-grain lists default to empty.
    name: str
    baseObject: str
    dimension: List[Column] = Field(default_factory=list)
    measure: List[Column] = Field(default_factory=list)
    timeGrain: List[MetricTimeGrain] = Field(default_factory=list)
    cached: bool = False
    refreshTime: Optional[str] = None
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# View Definitions
|
||||
class View(BaseModel):
    # View definition: a name, its statement text, and free-form properties.
    name: str
    statement: str
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Enum Definitions
|
||||
class EnumValue(BaseModel):
    # One enum member: a name, an optional value, and free-form properties.
    name: str
    value: Optional[str] = None
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
class EnumDefinition(BaseModel):
    # Named enum with its list of members and free-form properties.
    name: str
    values: List[EnumValue]
    properties: Dict[str, Any] = Field(default_factory=dict)
|
||||
|
||||
# Main Manifest
|
||||
class MDLManifest(BaseModel):
    # Top-level manifest: catalog and schema are required; every
    # collection defaults to an empty list.
    catalog: str
    # A field literally named `schema` would shadow an attribute that
    # BaseModel already defines, so the value is stored as `schema_` and
    # exposed under the external key "schema" via the alias.
    schema_: str = Field(..., alias="schema")
    dataSource: Optional[str] = None
    models: List[Model] = Field(default_factory=list)
    relationships: List[Relationship] = Field(default_factory=list)
    metrics: List[Metric] = Field(default_factory=list)
    views: List[View] = Field(default_factory=list)
    enumDefinitions: List[EnumDefinition] = Field(default_factory=list)

    # Pydantic v2 configuration (the nested `class Config` form is
    # deprecated): accept both "schema" and "schema_" on input.
    model_config = {"populate_by_name": True}
|
||||
@@ -0,0 +1,23 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, List
|
||||
from datetime import datetime
|
||||
|
||||
class ProjectBase(BaseModel):
    # Shared project fields: required name, optional description.
    name: str
    description: Optional[str] = None
|
||||
|
||||
class ProjectCreate(ProjectBase):
    # Creation payload; adds nothing to ProjectBase.
    pass
|
||||
|
||||
class ProjectUpdate(BaseModel):
    # Partial update: both fields are optional and default to None.
    name: Optional[str] = None
    description: Optional[str] = None
|
||||
|
||||
class Project(ProjectBase):
    # Stored project: base fields plus id, owner id, and timestamps.
    id: int
    owner_id: int
    created_at: datetime
    updated_at: datetime

    # Pydantic v2 configuration (the nested `class Config` form is
    # deprecated): allow validation from objects by reading attributes.
    model_config = {"from_attributes": True}
|
||||
@@ -0,0 +1,27 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
class SubagentBase(BaseModel):
    # Shared subagent fields; only `name` is required.
    name: str
    description: Optional[str] = None
    instructions: Optional[str] = None
    model: Optional[str] = None
|
||||
|
||||
class SubagentCreate(SubagentBase):
    # Creation payload; adds nothing to SubagentBase.
    pass
|
||||
|
||||
class SubagentUpdate(BaseModel):
    # Partial update: every field is optional and defaults to None.
    name: Optional[str] = None
    description: Optional[str] = None
    instructions: Optional[str] = None
    model: Optional[str] = None
|
||||
|
||||
class Subagent(SubagentBase):
    # Stored subagent: base fields plus id, project id, and timestamps.
    id: int
    project_id: int
    created_at: datetime
    updated_at: datetime

    # Pydantic v2 configuration (the nested `class Config` form is
    # deprecated): allow validation from objects by reading attributes.
    model_config = {"from_attributes": True}
|
||||
@@ -0,0 +1,30 @@
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
class UserBase(BaseModel):
    # Shared user fields. Users are active and non-admin by default.
    username: str
    email: str
    avatar: Optional[str] = None
    is_active: Optional[bool] = True
    is_admin: Optional[bool] = False
|
||||
|
||||
class UserCreate(UserBase):
    # Creation payload: base fields plus a required password.
    password: str
|
||||
|
||||
class UserUpdate(BaseModel):
    # Partial update: every field, including password, is optional.
    username: Optional[str] = None
    email: Optional[str] = None
    avatar: Optional[str] = None
    is_active: Optional[bool] = None
    is_admin: Optional[bool] = None
    password: Optional[str] = None
|
||||
|
||||
class ResendVerificationRequest(BaseModel):
    # Payload carrying only the username.
    username: str
|
||||
|
||||
class UserResponse(UserBase):
    # Response model: base fields plus id and creation time.
    id: int
    created_at: datetime

    # Lets the model be validated from objects by reading their attributes.
    model_config = ConfigDict(from_attributes=True)
|
||||
@@ -0,0 +1,704 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import ssl
|
||||
import time
|
||||
import uuid
|
||||
from collections import defaultdict, deque
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any, AsyncIterator, Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from app.models.a2a import (
|
||||
A2AAuditLog,
|
||||
A2AProjectConfig,
|
||||
A2ARemoteAgent,
|
||||
A2ATask,
|
||||
A2ATaskEvent,
|
||||
A2ATaskWebhook,
|
||||
A2AWebhookDelivery,
|
||||
)
|
||||
from app.models.project import Project
|
||||
from app.schemas.a2a import (
|
||||
A2AArtifactSchema,
|
||||
A2APartSchema,
|
||||
A2ATaskStatusSchema,
|
||||
TaskArtifactUpdateEvent,
|
||||
TaskStatusUpdateEvent,
|
||||
)
|
||||
from app.trace import build_error_attributes, trace_service
|
||||
|
||||
|
||||
def _json_loads(raw: Optional[str], default: Any) -> Any:
|
||||
if not raw:
|
||||
return default
|
||||
try:
|
||||
return json.loads(raw)
|
||||
except Exception:
|
||||
return default
|
||||
|
||||
|
||||
def _json_dumps(raw: Any) -> str:
|
||||
return json.dumps(raw, ensure_ascii=False)
|
||||
|
||||
|
||||
def _utc_now() -> datetime:
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
|
||||
def _mask_error(message: str) -> str:
|
||||
if not message:
|
||||
return "internal_error"
|
||||
return "request_failed"
|
||||
|
||||
|
||||
class SharedSecretAuth:
    """HMAC-SHA256 helpers for shared-secret signing and verification."""

    @staticmethod
    def generate_signature(secret: str, payload: bytes, timestamp: Optional[int] = None) -> Tuple[str, int]:
        """Sign ``payload`` together with a timestamp.

        The signed message is the decimal timestamp immediately followed by
        the payload bytes. When ``timestamp`` is None the current Unix time
        is used.

        Returns:
            A ``("sha256=<hex digest>", timestamp)`` tuple.
        """
        if timestamp is None:
            timestamp = int(time.time())
        message = f"{timestamp}".encode() + payload
        signature = hmac.new(secret.encode(), message, hashlib.sha256).hexdigest()
        return f"sha256={signature}", timestamp

    @staticmethod
    def verify_signature(secret: str, payload: bytes, signature: str, timestamp: int, max_age_seconds: int = 300) -> bool:
        """Check ``signature`` against the expected value for ``payload``.

        Returns False when the timestamp is more than ``max_age_seconds``
        away from the current time (in either direction), when the signature
        is not a string, or when it does not match.
        """
        if abs(time.time() - timestamp) > max_age_seconds:
            return False
        if not isinstance(signature, str):
            return False
        expected_sig, _ = SharedSecretAuth.generate_signature(secret, payload, timestamp)
        # compare_digest raises TypeError for str arguments containing
        # non-ASCII characters, so compare the UTF-8 encodings instead; a
        # malformed signature is then rejected rather than crashing.
        return hmac.compare_digest(signature.encode("utf-8"), expected_sig.encode("utf-8"))

    @staticmethod
    def sign_request(secret: str, method: str, path: str, body: Optional[bytes] = None) -> Dict[str, str]:
        """Build signature headers for an outgoing request.

        The signed message is ``"<timestamp>.<METHOD>.<path>"`` followed by
        the body bytes (empty when ``body`` is None). Note that this message
        layout differs from the one used by ``generate_signature``.

        Returns:
            A dict with the ``X-A2A-Signature`` and ``X-A2A-Timestamp`` headers.
        """
        timestamp = int(time.time())
        payload = body or b""
        message = f"{timestamp}.{method.upper()}.{path}".encode() + payload
        signature = hmac.new(secret.encode(), message, hashlib.sha256).hexdigest()
        return {
            "X-A2A-Signature": f"sha256={signature}",
            "X-A2A-Timestamp": str(timestamp),
        }
|
||||
|
||||
|
||||
class MtlsConfig:
    """Holds mutual-TLS certificate paths and builds a client SSL context."""

    def __init__(
        self,
        ca_cert: Optional[str] = None,
        client_cert: Optional[str] = None,
        client_key: Optional[str] = None,
    ):
        """Store the optional CA, client certificate, and client key values."""
        self.ca_cert = ca_cert
        self.client_cert = client_cert
        self.client_key = client_key

    def create_ssl_context(self) -> Optional[ssl.SSLContext]:
        """Return a client SSL context, or None without a cert/key pair.

        With a CA certificate configured, the server is verified against it.
        Without one, server verification is turned off.
        """
        if not self.client_cert or not self.client_key:
            return None
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        ctx.load_cert_chain(self.client_cert, self.client_key)
        if self.ca_cert:
            ctx.load_verify_locations(self.ca_cert)
            ctx.verify_mode = ssl.CERT_REQUIRED
        else:
            # PROTOCOL_TLS_CLIENT enables check_hostname by default, and
            # assigning CERT_NONE while it is on raises ValueError, so it
            # must be switched off first.
            # NOTE(review): this branch disables server certificate
            # verification entirely; consider requiring a CA or falling back
            # to the system trust store.
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
        return ctx
|
||||
|
||||
|
||||
class OAuth2TokenStore:
    """In-memory token cache keyed by string and guarded by an asyncio lock."""

    def __init__(self):
        """Create an empty store with its own lock."""
        self._tokens: Dict[str, Tuple[str, datetime]] = {}
        self._lock = asyncio.Lock()

    async def get_token(self, key: str) -> Optional[str]:
        """Return the cached token for *key*, or None.

        A token is only returned while its expiry lies more than one minute
        in the future.
        """
        async with self._lock:
            entry = self._tokens.get(key)
            if entry is None:
                return None
            token, expires_at = entry
            still_valid = expires_at > _utc_now() + timedelta(minutes=1)
            return token if still_valid else None

    async def set_token(self, key: str, token: str, expires_in: int = 3600) -> None:
        """Cache *token* under *key*, expiring *expires_in* seconds from now."""
        async with self._lock:
            expiry = _utc_now() + timedelta(seconds=expires_in)
            self._tokens[key] = (token, expiry)
|
||||
|
||||
|
||||
class OAuth2Auth:
    """Fetches and caches OAuth2 access tokens from a token endpoint."""

    def __init__(
        self,
        client_id: str,
        client_secret: str,
        token_url: str,
        scopes: Optional[List[str]] = None,
    ):
        """Store client credentials and create a private token store."""
        self.client_id = client_id
        self.client_secret = client_secret
        self.token_url = token_url
        self.scopes = scopes or []
        self._token_store = OAuth2TokenStore()

    def _get_cache_key(self) -> str:
        """Return the cache key built from client id, token URL, and scopes."""
        return f"{self.client_id}:{self.token_url}:{':'.join(self.scopes)}"

    @staticmethod
    def _coerce_expires_in(raw: Any, default: int = 3600) -> int:
        """Convert a token response's ``expires_in`` to whole seconds.

        The value is converted with ``int()``; when it is missing or cannot
        be converted, ``default`` is returned instead of raising.
        """
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return default

    async def get_access_token(self, grant_type: str = "client_credentials") -> str:
        """Return an access token, using the cache when possible.

        On a cache miss, posts the client credentials (and scopes, if any)
        to the token URL, caches the returned token, and returns it.

        Raises:
            httpx.HTTPStatusError: if the token endpoint returns an error status.
            KeyError: if the response has no ``access_token`` entry.
        """
        cache_key = self._get_cache_key()
        cached = await self._token_store.get_token(cache_key)
        if cached:
            return cached

        async with httpx.AsyncClient() as client:
            data = {
                "client_id": self.client_id,
                "client_secret": self.client_secret,
                "grant_type": grant_type,
            }
            if self.scopes:
                data["scope"] = " ".join(self.scopes)
            resp = await client.post(self.token_url, data=data)
            resp.raise_for_status()
            token_data = resp.json()
            token = token_data["access_token"]
            # A malformed lifetime must not discard a token that was
            # obtained successfully, so fall back to the default instead.
            expires_in = self._coerce_expires_in(token_data.get("expires_in", 3600))
            await self._token_store.set_token(cache_key, token, expires_in)
            return token

    async def authorize_request(self, method: str, url: str, **kwargs) -> Dict[str, str]:
        """Return a bearer ``Authorization`` header; the arguments are unused."""
        token = await self.get_access_token()
        return {"Authorization": f"Bearer {token}"}
|
||||
|
||||
|
||||
class OIDCAuth:
    """Obtains access tokens via the token endpoint from OIDC discovery."""

    def __init__(
        self,
        issuer_url: str,
        client_id: str,
        client_secret: Optional[str] = None,
        scopes: Optional[List[str]] = None,
    ):
        """Store issuer and client settings; trailing slashes are stripped
        from the issuer URL and scopes default to ``["openid", "profile"]``."""
        self.issuer_url = issuer_url.rstrip("/")
        self.client_id = client_id
        self.client_secret = client_secret
        self.scopes = scopes or ["openid", "profile"]
        self._oauth2: Optional[OAuth2Auth] = None
        self._discovery_cache: Optional[Dict[str, Any]] = None

    async def _get_discovery(self) -> Dict[str, Any]:
        """Return the discovery document, fetching it on first use."""
        if not self._discovery_cache:
            well_known = f"{self.issuer_url}/.well-known/openid-configuration"
            async with httpx.AsyncClient() as http:
                response = await http.get(well_known)
                response.raise_for_status()
                self._discovery_cache = response.json()
        return self._discovery_cache

    async def get_access_token(self) -> str:
        """Return an access token from the discovered token endpoint.

        Raises:
            RuntimeError: if the discovery document has no ``token_endpoint``.
        """
        metadata = await self._get_discovery()
        endpoint = metadata.get("token_endpoint")
        if not endpoint:
            raise RuntimeError("OIDC discovery missing token_endpoint")
        # The OAuth2 helper is created once and reused afterwards.
        if not self._oauth2:
            self._oauth2 = OAuth2Auth(
                client_id=self.client_id,
                client_secret=self.client_secret or "",
                token_url=endpoint,
                scopes=self.scopes,
            )
        return await self._oauth2.get_access_token()

    async def authorize_request(self, method: str, url: str, **kwargs) -> Dict[str, str]:
        """Return a bearer ``Authorization`` header; the arguments are unused."""
        bearer = await self.get_access_token()
        return {"Authorization": f"Bearer {bearer}"}
|
||||
|
||||
|
||||
class RemoteAgentSecuritySelector:
    """Chooses and applies an authentication scheme for one remote A2A agent.

    The preferred scheme is the first key of the first entry in the agent
    card's ``security`` list; when the card declares none, the agent's own
    ``auth_scheme`` is used, defaulting to ``"bearer"``.
    """

    def __init__(self, agent: A2ARemoteAgent):
        self.agent = agent
        # Populated by load_security_from_card(); not read by the other
        # methods of this class.
        self._card_security_schemes: Optional[Dict[str, Any]] = None

    def load_security_from_card(self) -> None:
        """Remember the ``securitySchemes`` section of the stored agent card."""
        card = _json_loads(self.agent.card_json, {})
        if not card:
            return
        self._card_security_schemes = card.get("securitySchemes", {})

    def get_preferred_auth_scheme(self) -> str:
        """Return the scheme name to use for this agent.

        Only the first requirement of the card's ``security`` list is
        inspected, and only if it is a non-empty dict.
        """
        requirements = _json_loads(self.agent.card_json, {}).get("security", [])
        leading = requirements[0] if requirements else None
        if isinstance(leading, dict) and leading:
            return next(iter(leading))
        return self.agent.auth_scheme or "bearer"

    def get_auth_headers(self, user_token: Optional[str] = None) -> Dict[str, str]:
        """Return static auth headers that need no network round trip.

        Only the bearer scheme yields a header here: the agent's configured
        token wins, otherwise the caller-supplied ``user_token`` is used.
        Every other scheme (shared secret, OAuth2, OpenID Connect, mutual TLS)
        contributes nothing in this method.
        """
        headers: Dict[str, str] = {}
        preferred = self.get_preferred_auth_scheme()

        if preferred == "bearer" or self.agent.auth_scheme == "bearer":
            bearer_token = self.agent.auth_token or user_token
            if bearer_token:
                headers["Authorization"] = f"Bearer {bearer_token}"

        return headers

    def get_mtls_context(self) -> Optional[ssl.SSLContext]:
        """Return an SSL context for mutual TLS, or ``None`` when it does not apply.

        A context is built only when mutual TLS is the agent's or the card's
        scheme and both a client certificate and a client key are configured.
        """
        wants_mtls = (
            self.agent.auth_scheme == "mutualTLS"
            or self.get_preferred_auth_scheme() == "mutualTLS"
        )
        if not wants_mtls:
            return None
        if not (self.agent.mtls_client_cert and self.agent.mtls_client_key):
            return None
        mtls_config = MtlsConfig(
            ca_cert=self.agent.mtls_ca_cert,
            client_cert=self.agent.mtls_client_cert,
            client_key=self.agent.mtls_client_key,
        )
        return mtls_config.create_ssl_context()

    def create_signed_request_headers(
        self,
        method: str,
        path: str,
        body: Optional[bytes] = None,
    ) -> Dict[str, str]:
        """Return headers for a request that may need a shared-secret signature.

        With the shared-secret scheme preferred and a secret configured, the
        headers come from ``SharedSecretAuth.sign_request``. Otherwise a bearer
        header is produced when the agent is configured for bearer auth with a
        token. In every other case the result is empty.
        """
        headers: Dict[str, str] = {}
        preferred = self.get_preferred_auth_scheme()

        if preferred == "shared_secret" and self.agent.shared_secret:
            signature_headers = SharedSecretAuth.sign_request(
                self.agent.shared_secret,
                method,
                path,
                body,
            )
            headers.update(signature_headers)
            return headers

        if self.agent.auth_scheme == "bearer" and self.agent.auth_token:
            headers["Authorization"] = f"Bearer {self.agent.auth_token}"
        return headers

    async def get_oauth2_auth(self) -> Optional[OAuth2Auth]:
        """Build an ``OAuth2Auth`` from the agent's settings, or return ``None``.

        Requires both a client id and a token URL; scopes are the
        whitespace-separated ``oauth2_scopes`` string.
        """
        if not (self.agent.oauth2_client_id and self.agent.oauth2_token_url):
            return None
        raw_scopes = self.agent.oauth2_scopes
        return OAuth2Auth(
            client_id=self.agent.oauth2_client_id,
            client_secret=self.agent.oauth2_client_secret or "",
            token_url=self.agent.oauth2_token_url,
            scopes=raw_scopes.split() if raw_scopes else [],
        )

    async def get_oidc_auth(self) -> Optional[OIDCAuth]:
        """Build an ``OIDCAuth`` from the agent's settings, or return ``None``.

        Requires an issuer URL. Scopes are taken from ``oauth2_scopes``.
        """
        if not self.agent.oidc_issuer_url:
            return None
        raw_scopes = self.agent.oauth2_scopes
        return OIDCAuth(
            issuer_url=self.agent.oidc_issuer_url,
            client_id=self.agent.oidc_client_id or "",
            client_secret=self.agent.oidc_client_secret,
            scopes=raw_scopes.split() if raw_scopes else [],
        )

    async def authorize_request(self, method: str, url: str, user_token: Optional[str] = None) -> Dict[str, str]:
        """Return the complete auth headers for a request to the agent.

        Starts from ``get_auth_headers`` and, for OAuth2 or OpenID Connect,
        merges in the headers produced by the matching token helper when one
        can be built from the agent's settings.
        """
        headers = self.get_auth_headers(user_token)
        preferred = self.get_preferred_auth_scheme()

        if preferred in ("oauth2", "oauth2_authorizationcode", "oauth2_clientcredentials"):
            token_helper = await self.get_oauth2_auth()
        elif preferred == "openIdConnect":
            token_helper = await self.get_oidc_auth()
        else:
            token_helper = None

        if token_helper:
            headers.update(await token_helper.authorize_request(method, url))
        return headers
|
||||
|
||||
# Allowed task state changes: each key maps a current state to the set of
# states it may move to. States mapped to an empty set accept no further
# transition.
_STATE_TRANSITIONS = {
    "SUBMITTED": {
        "WORKING",
        "FAILED",
        "CANCELED",
        "REJECTED",
        "AUTH_REQUIRED",
        "INPUT_REQUIRED",
        "COMPLETED",
    },
    "WORKING": {
        "COMPLETED",
        "FAILED",
        "CANCELED",
        "INPUT_REQUIRED",
        "AUTH_REQUIRED",
    },
    "INPUT_REQUIRED": {"WORKING", "FAILED", "CANCELED"},
    "AUTH_REQUIRED": {"WORKING", "FAILED", "CANCELED", "REJECTED"},
    "REJECTED": set(),
    "FAILED": set(),
    "COMPLETED": set(),
    "CANCELED": set(),
}

# States that end a task.
_TERMINAL_STATES = {
    "COMPLETED",
    "FAILED",
    "CANCELED",
    "REJECTED",
}
|
||||
|
||||
|
||||
@dataclass
class A2AResolvedRoute:
    """Outcome of resolving which execution path should handle a request."""

    # Path chosen for the request.
    selected: str
    # Paths to try, in order, if the selected one cannot be used.
    fallback_chain: List[str]
    # Whether the request fell inside the canary bucket.
    canary_hit: bool
    # Short machine-readable explanation of the decision.
    reason: str
|
||||
|
||||
|
||||
class A2AMetrics:
    """In-memory counters and latency samples guarded by an asyncio lock.

    Each latency key keeps at most its 2000 most recent samples.
    """

    def __init__(self) -> None:
        self._lock = asyncio.Lock()
        self._counters: Dict[str, int] = defaultdict(int)
        self._latency_ms: Dict[str, deque[float]] = defaultdict(lambda: deque(maxlen=2000))

    async def incr(self, key: str, value: int = 1) -> None:
        """Add ``value`` (default 1) to the counter named ``key``."""
        async with self._lock:
            self._counters[key] += value

    async def observe_latency(self, key: str, elapsed_ms: float) -> None:
        """Record one latency sample, in milliseconds, under ``key``."""
        async with self._lock:
            self._latency_ms[key].append(float(elapsed_ms))

    async def snapshot(self) -> Dict[str, Any]:
        """Return counters, derived per-request rates and p95 latencies.

        Returns:
            A dict with ``counters`` (a copy of all counters), ``derived``
            (error, retry and circuit-open counts divided by
            ``a2a.requests.total``, rounded to 4 places, or 0.0 when there
            were no requests) and ``latency`` (``"<key>.p95_ms"`` entries
            rounded to 2 places).
        """
        async with self._lock:
            counters = dict(self._counters)
            ordered_samples = {key: sorted(samples) for key, samples in self._latency_ms.items()}

        latency: Dict[str, float] = {}
        for key, series in ordered_samples.items():
            label = f"{key}.p95_ms"
            if series:
                # Nearest-rank style pick: index 95% of the way through the
                # sorted samples, truncated towards zero.
                latency[label] = round(series[int(0.95 * (len(series) - 1))], 2)
            else:
                latency[label] = 0.0

        total = counters.get("a2a.requests.total", 0)

        def per_request(counter_name: str) -> float:
            return round(counters.get(counter_name, 0) / total, 4) if total else 0.0

        return {
            "counters": counters,
            "derived": {
                "error_rate": per_request("a2a.requests.error"),
                "retry_rate": per_request("a2a.requests.retry"),
                "circuit_open_rate": per_request("a2a.circuit.open"),
            },
            "latency": latency,
        }
|
||||
|
||||
|
||||
class A2ARuntime:
    """Process-local A2A runtime.

    Bundles event fan-out to in-process subscribers, route resolution with
    canary bucketing, persistence helpers for tasks, events and audit rows,
    agent-card fetching with retries and a circuit breaker, and webhook
    delivery. Subscriber queues and the circuit map live in memory only.
    """

    def __init__(self) -> None:
        # task id -> queues of the currently attached subscribe() generators.
        self._subscribers: Dict[str, List[asyncio.Queue[Dict[str, Any]]]] = defaultdict(list)
        self.metrics = A2AMetrics()
        self.protocol_version = "1.0"
        # agent id -> instant until which card fetches are refused.
        self._circuit_state: Dict[int, datetime] = {}

    async def publish(self, task_id: str, event: Dict[str, Any]) -> None:
        """Put ``event`` on the queue of every current subscriber of ``task_id``.

        Subscriber queues are bounded, so this waits while a queue is full.
        """
        # Iterate over a snapshot: the subscriber list can change while we await.
        for subscriber_queue in tuple(self._subscribers.get(task_id, ())):
            await subscriber_queue.put(event)

    async def subscribe(self, task_id: str) -> AsyncIterator[Dict[str, Any]]:
        """Yield events published for ``task_id`` until the generator is closed.

        The subscription queue holds at most 200 pending events and is
        unregistered when the generator exits.
        """
        queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue(maxsize=200)
        self._subscribers[task_id].append(queue)
        try:
            while True:
                yield await queue.get()
        finally:
            remaining = [q for q in self._subscribers.get(task_id, []) if q is not queue]
            if remaining:
                self._subscribers[task_id] = remaining
            else:
                # Drop the key entirely so idle tasks leave no entry behind.
                self._subscribers.pop(task_id, None)

    def get_project_config(self, db: Session, project_id: int, user_id: int) -> A2AProjectConfig:
        """Return the project's A2A config row, creating and committing one if absent."""
        existing = db.query(A2AProjectConfig).filter(A2AProjectConfig.project_id == project_id).first()
        if existing:
            return existing
        created = A2AProjectConfig(project_id=project_id, updated_by=user_id)
        db.add(created)
        db.commit()
        db.refresh(created)
        return created

    def resolve_route(self, *, project_config: A2AProjectConfig, session_id: str, requested_mode: str, requested_fallback: Optional[List[str]]) -> A2AResolvedRoute:
        """Decide which path (``a2a``, ``mcp`` or ``local``) handles a session.

        The mode is ``requested_mode``, else the project default, else
        ``"local_first"``. The fallback chain keeps only ``a2a``/``local``/``mcp``
        entries and defaults to ``["local"]``. With the canary enabled, the
        session is hashed (SHA-256 of ``"<project_id>:<session_id>"``) into a
        0-99 bucket and is a hit when the bucket is below ``canary_percent``.

        NOTE(review): as written, an A2A mode is honoured only on a canary hit,
        so with the canary disabled an A2A request is routed to ``local`` -
        confirm this gating is intended.
        """
        selected = requested_mode or project_config.route_mode_default or "local_first"
        candidates = requested_fallback or _json_loads(project_config.fallback_chain_json, ["local"])
        fallback_chain = [item for item in candidates if item in {"a2a", "local", "mcp"}]
        if not fallback_chain:
            fallback_chain = ["local"]

        canary_hit = False
        if project_config.canary_enabled and project_config.canary_percent > 0:
            seed = f"{project_config.project_id}:{session_id}".encode()
            bucket = int(hashlib.sha256(seed).hexdigest()[:8], 16) % 100
            canary_hit = bucket < project_config.canary_percent

        wants_a2a = selected in {"a2a_first", "a2a"}
        if wants_a2a and not canary_hit:
            return A2AResolvedRoute(
                selected="local",
                fallback_chain=fallback_chain,
                canary_hit=False,
                reason="canary_not_hit_fallback_local",
            )
        if wants_a2a:
            return A2AResolvedRoute(selected="a2a", fallback_chain=fallback_chain, canary_hit=canary_hit, reason="a2a_selected")
        if selected in {"mcp_first", "mcp"}:
            return A2AResolvedRoute(selected="mcp", fallback_chain=fallback_chain, canary_hit=canary_hit, reason="mcp_selected")
        return A2AResolvedRoute(selected="local", fallback_chain=fallback_chain, canary_hit=canary_hit, reason="local_selected")

    def can_transition(self, from_state: str, to_state: str) -> bool:
        """Return whether a task may move from ``from_state`` to ``to_state``.

        Staying in the same state is always allowed; unknown source states
        allow nothing else.
        """
        return from_state == to_state or to_state in _STATE_TRANSITIONS.get(from_state, set())

    def create_task(
        self,
        db: Session,
        *,
        project_id: int,
        tenant_id: int,
        source: str,
        input_text: str,
        idempotency_key: Optional[str],
        remote_agent_id: Optional[int],
        compatibility_mode: bool,
        metadata: Optional[Dict[str, Any]] = None,
        context_id: Optional[str] = None,
    ) -> A2ATask:
        """Create and commit a task in state ``SUBMITTED``.

        When ``idempotency_key`` is given and a task with the same project,
        tenant and key already exists, that task is returned instead.
        """
        if idempotency_key:
            duplicate = (
                db.query(A2ATask)
                .filter(
                    A2ATask.project_id == project_id,
                    A2ATask.tenant_id == tenant_id,
                    A2ATask.idempotency_key == idempotency_key,
                )
                .first()
            )
            if duplicate:
                return duplicate

        new_task = A2ATask(
            id=f"task_{uuid.uuid4().hex}",
            project_id=project_id,
            tenant_id=tenant_id,
            source=source,
            remote_agent_id=remote_agent_id,
            state="SUBMITTED",
            input_text=input_text,
            idempotency_key=idempotency_key,
            compatibility_mode=compatibility_mode,
            metadata_json=_json_dumps(metadata or {}),
            context_id=context_id,
        )
        db.add(new_task)
        db.commit()
        db.refresh(new_task)
        return new_task

    def append_event(self, db: Session, task: A2ATask, event_type: str, payload: Dict[str, Any]) -> A2ATaskEvent:
        """Persist one event for ``task`` and return the refreshed row."""
        record = A2ATaskEvent(task_id=task.id, event_type=event_type, payload_json=_json_dumps(payload))
        db.add(record)
        db.commit()
        db.refresh(record)
        return record

    def transition_task(
        self,
        db: Session,
        task: A2ATask,
        *,
        to_state: str,
        output_text: Optional[str] = None,
        error_message: Optional[str] = None,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> A2ATask:
        """Move ``task`` to ``to_state`` and commit.

        Optional fields are written only when not ``None``; ``finished_at`` is
        stamped when the new state is terminal.

        Raises:
            ValueError: If the transition is not allowed.
        """
        if not self.can_transition(task.state, to_state):
            raise ValueError(f"Invalid task transition: {task.state} -> {to_state}")

        task.state = to_state
        if output_text is not None:
            task.output_text = output_text
        if error_message is not None:
            task.error_message = error_message
        if metadata is not None:
            task.metadata_json = _json_dumps(metadata)
        if to_state in _TERMINAL_STATES:
            task.finished_at = _utc_now()

        db.add(task)
        db.commit()
        db.refresh(task)
        return task

    def record_audit(
        self,
        db: Session,
        *,
        actor_user_id: int,
        action: str,
        target_type: str,
        target_id: str,
        result: str,
        project_id: Optional[int] = None,
        task_id: Optional[str] = None,
        detail: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Write and commit one audit log row; ``detail`` is stored as JSON."""
        entry = A2AAuditLog(
            actor_user_id=actor_user_id,
            action=action,
            target_type=target_type,
            target_id=target_id,
            result=result,
            project_id=project_id,
            task_id=task_id,
            detail_json=_json_dumps(detail or {}),
        )
        db.add(entry)
        db.commit()

    async def fetch_agent_card(self, db: Session, agent: A2ARemoteAgent, *, timeout_s: float = 10.0) -> Dict[str, Any]:
        """Download the agent card and persist it on ``agent``.

        Makes up to three attempts with 0.2 s and 0.4 s pauses in between.
        Success resets the agent's failure bookkeeping. After the final failed
        attempt the failure count is incremented, the agent is marked
        unhealthy, and at three or more failures the circuit is opened for
        90 seconds.

        Raises:
            RuntimeError: ``"circuit_open"`` while the in-memory circuit for
                this agent is open, or ``"http_<status>"`` for a status of 400
                or above on the last attempt.
            Exception: Any other error from the last attempt is re-raised.
        """
        blocked_until = self._circuit_state.get(agent.id)
        if blocked_until is not None and blocked_until > _utc_now():
            raise RuntimeError("circuit_open")

        # Started before the retry loop, so the observed latency includes
        # earlier failed attempts and their back-off pauses.
        started = time.perf_counter()
        await self.metrics.incr("a2a.requests.total")

        request_headers = {}
        if agent.auth_scheme == "bearer" and agent.auth_token:
            request_headers["Authorization"] = f"Bearer {agent.auth_token}"
        url = f"{agent.base_url.rstrip('/')}/api/v1/a2a/agent-card"

        with trace_service.start_span("a2a.card.fetch", attributes={"agent_id": agent.id, "url": url}) as span:
            for attempt in range(3):
                try:
                    async with httpx.AsyncClient(timeout=timeout_s, verify=True) as client:
                        resp = await client.get(url, headers=request_headers)
                    if resp.status_code >= 400:
                        raise RuntimeError(f"http_{resp.status_code}")
                    payload = resp.json()

                    elapsed_ms = (time.perf_counter() - started) * 1000
                    await self.metrics.observe_latency("a2a.card.fetch", elapsed_ms)

                    agent.card_json = _json_dumps(payload)
                    agent.protocol_version = str(payload.get("protocol_version") or "")
                    agent.capabilities_json = _json_dumps(payload.get("capabilities") or [])
                    agent.card_fetched_at = _utc_now()
                    agent.healthy = True
                    agent.failure_count = 0
                    agent.circuit_open_until = None
                    db.add(agent)
                    db.commit()
                    db.refresh(agent)
                    return payload
                except Exception as exc:
                    span.set_attributes(build_error_attributes(exc, stage="a2a_card_fetch"))
                    await self.metrics.incr("a2a.requests.error")
                    if attempt < 2:
                        await self.metrics.incr("a2a.requests.retry")
                        await asyncio.sleep(0.2 * (2 ** attempt))
                        continue

                    # Final attempt failed: record it and maybe open the circuit.
                    agent.failure_count = (agent.failure_count or 0) + 1
                    if agent.failure_count >= 3:
                        reopen_at = _utc_now() + timedelta(seconds=90)
                        agent.circuit_open_until = reopen_at
                        self._circuit_state[agent.id] = reopen_at
                        await self.metrics.incr("a2a.circuit.open")
                    agent.healthy = False
                    db.add(agent)
                    db.commit()
                    raise

    async def notify_webhooks(self, db: Session, task: A2ATask, event: A2ATaskEvent) -> None:
        """Deliver ``event`` to every enabled webhook of ``task``, one after another.

        A ``PENDING`` delivery row is committed per webhook before delivery is
        attempted.
        """
        hooks = db.query(A2ATaskWebhook).filter(A2ATaskWebhook.task_id == task.id, A2ATaskWebhook.enabled == True).all()
        if not hooks:
            return
        for hook in hooks:
            delivery = A2AWebhookDelivery(task_id=task.id, webhook_id=hook.id, event_id=event.id, attempt=0, status="PENDING")
            db.add(delivery)
            db.commit()
            db.refresh(delivery)
            await self._deliver_once(db, hook, event, delivery)

    async def _deliver_once(self, db: Session, hook: A2ATaskWebhook, event: A2ATaskEvent, delivery: A2AWebhookDelivery) -> None:
        """POST the event to one webhook, retrying up to four attempts.

        The body is signed with HMAC-SHA256 when the hook has a secret. A 2xx
        response marks the delivery ``DELIVERED``. Failures pause 2, 4 and 8
        seconds between attempts; after the fourth the delivery is marked
        ``FAILED`` and flagged as dead letter. Progress is committed to
        ``delivery`` throughout.
        """
        event_payload = _json_loads(event.payload_json, {})
        envelope = self._build_stream_response_payload(event, event_payload)
        body = _json_dumps(envelope).encode("utf-8")

        for attempt in range(1, 5):
            delivery.attempt = attempt
            db.add(delivery)
            db.commit()

            headers = {"Content-Type": "application/json", "X-A2A-Event-Id": str(event.id)}
            if hook.secret:
                digest = hmac.new(hook.secret.encode("utf-8"), body, hashlib.sha256).hexdigest()
                headers["X-A2A-Signature"] = f"sha256={digest}"
            if hook.auth_header:
                headers["Authorization"] = hook.auth_header

            try:
                async with httpx.AsyncClient(timeout=8.0, verify=True) as client:
                    resp = await client.post(hook.target_url, content=body, headers=headers)
                delivery.response_code = resp.status_code
                delivery.response_body = (resp.text or "")[:1000]
                if 200 <= resp.status_code < 300:
                    delivery.status = "DELIVERED"
                    delivery.dead_letter = False
                    delivery.delivered_at = _utc_now()
                    db.add(delivery)
                    db.commit()
                    return
                # Non-2xx is handled by the same retry path as transport errors.
                raise RuntimeError(f"http_{resp.status_code}")
            except Exception as exc:
                delivery.error_message = str(exc)[:500]
                if attempt < 4:
                    backoff_seconds = 2 ** attempt
                    delivery.status = "RETRYING"
                    delivery.next_retry_at = _utc_now() + timedelta(seconds=backoff_seconds)
                    db.add(delivery)
                    db.commit()
                    await asyncio.sleep(backoff_seconds)
                    continue
                delivery.status = "FAILED"
                delivery.dead_letter = True
                db.add(delivery)
                db.commit()
                return

    def _build_stream_response_payload(self, event: A2ATaskEvent, event_payload: Dict[str, Any]) -> Dict[str, Any]:
        """Wrap a stored event payload in the envelope sent to webhooks.

        Status and artifact events are converted through their schema classes
        and dumped to plain dicts under ``statusUpdate`` / ``artifactUpdate``;
        any other event type is passed through under ``message``.
        """
        kind = event.event_type
        task_id = event_payload.get("task_id", event.task_id)

        if kind == "TaskStatusUpdateEvent":
            raw_timestamp = event_payload.get("timestamp", _utc_now().isoformat())
            if isinstance(raw_timestamp, str):
                parsed_timestamp = datetime.fromisoformat(raw_timestamp)
            else:
                parsed_timestamp = raw_timestamp
            status_schema = A2ATaskStatusSchema(
                state=event_payload.get("task_status", "WORKING"),
                timestamp=parsed_timestamp,
            )
            status_update = TaskStatusUpdateEvent(
                taskId=task_id,
                contextId=event_payload.get("context_id"),
                status=status_schema,
                metadata=event_payload.get("metadata", {}),
            )
            return {"statusUpdate": status_update.model_dump()}

        if kind == "TaskArtifactUpdateEvent":
            artifact_text = event_payload.get("artifact", {}).get("content", "")
            artifact_schema = A2AArtifactSchema(
                artifactId=f"artifact-{event.id}",
                parts=[A2APartSchema(part_type="text", text=artifact_text)],
            )
            artifact_update = TaskArtifactUpdateEvent(
                taskId=task_id,
                contextId=event_payload.get("context_id"),
                artifact=artifact_schema,
                append=False,
                lastChunk=True,
            )
            return {"artifactUpdate": artifact_update.model_dump()}

        return {"message": event_payload}


a2a_runtime = A2ARuntime()
|
||||
@@ -0,0 +1,77 @@
|
||||
import json
|
||||
import threading
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
class EmbeddingModelStore:
    """Thread-safe JSON-file store for embedding model records.

    Records are plain dicts kept as a JSON list in
    ``<data root>/embedding_models.json``. Every public method re-reads the
    file under a re-entrant lock; nothing is cached in memory.
    """

    def __init__(self) -> None:
        self._lock = threading.RLock()

    @staticmethod
    def _file_path() -> Path:
        """Return the location of the backing JSON file."""
        return get_data_root() / "embedding_models.json"

    def _read(self) -> List[Dict[str, Any]]:
        """Load all records; a missing, unreadable or non-list file yields ``[]``."""
        file_path = self._file_path()
        if not file_path.exists():
            return []
        try:
            with file_path.open("r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, json.JSONDecodeError):
            return []
        return data if isinstance(data, list) else []

    def _write(self, data: List[Dict[str, Any]]) -> None:
        """Persist ``data`` atomically.

        The JSON is written to a sibling temporary file that then replaces the
        real one, so a serialization or I/O failure leaves the previous
        contents intact instead of a truncated file (which ``_read`` would
        report as an empty store).
        """
        file_path = self._file_path()
        file_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = file_path.with_name(file_path.name + ".tmp")
        try:
            with tmp_path.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            tmp_path.replace(file_path)
        finally:
            # Only still present when the write or the swap failed.
            if tmp_path.exists():
                tmp_path.unlink()

    def list_models(self) -> List[Dict[str, Any]]:
        """Return every stored model record."""
        with self._lock:
            return self._read()

    def get_model(self, model_id: str) -> Optional[Dict[str, Any]]:
        """Return the record whose ``id`` equals ``model_id``, or ``None``."""
        with self._lock:
            return next((item for item in self._read() if item.get("id") == model_id), None)

    def create_model(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Store a copy of ``payload`` under a freshly generated ``id``.

        Any ``id`` supplied in ``payload`` is replaced. Returns the new record.
        """
        with self._lock:
            data = self._read()
            new_model = payload.copy()
            new_model["id"] = uuid.uuid4().hex
            data.append(new_model)
            self._write(data)
            return new_model

    def update_model(self, model_id: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Merge ``payload`` into the record with ``model_id``.

        The record's ``id`` is never changed, even if ``payload`` carries one.
        Returns the updated record, or ``None`` when no record matches.
        """
        with self._lock:
            data = self._read()
            for item in data:
                if item.get("id") != model_id:
                    continue
                # The id is the lookup key; a payload must not re-key the record.
                item.update((key, value) for key, value in payload.items() if key != "id")
                self._write(data)
                return item
            return None

    def delete_model(self, model_id: str) -> bool:
        """Remove the record with ``model_id``; return whether one was removed."""
        with self._lock:
            data = self._read()
            kept = [item for item in data if item.get("id") != model_id]
            if len(kept) == len(data):
                return False
            self._write(kept)
            return True


embedding_model_store = EmbeddingModelStore()
|
||||
@@ -0,0 +1,188 @@
|
||||
import json
|
||||
import threading
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
|
||||
def _utcnow_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
||||
|
||||
|
||||
class KnowledgeBaseStore:
    """Thread-safe JSON-file store for knowledge bases and their documents.

    Knowledge bases are dicts kept as a JSON list in
    ``<data root>/knowledge_bases.json``; each carries its documents inline
    under ``"documents"``. Every public method re-reads the file under a
    re-entrant lock; nothing is cached in memory.
    """

    def __init__(self) -> None:
        self._lock = threading.RLock()

    @staticmethod
    def _file_path() -> Path:
        """Return the location of the backing JSON file."""
        return get_data_root() / "knowledge_bases.json"

    def _read(self) -> List[Dict[str, Any]]:
        """Load all knowledge bases; a missing, unreadable or non-list file yields ``[]``."""
        file_path = self._file_path()
        if not file_path.exists():
            return []
        try:
            with file_path.open("r", encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, OSError):
            return []
        return data if isinstance(data, list) else []

    def _write(self, data: List[Dict[str, Any]]) -> None:
        """Persist ``data`` atomically.

        The JSON is written to a sibling temporary file that then replaces the
        real one, so a serialization or I/O failure leaves the previous
        contents intact instead of a truncated file (which ``_read`` would
        report as an empty store).
        """
        file_path = self._file_path()
        file_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = file_path.with_name(file_path.name + ".tmp")
        try:
            with tmp_path.open("w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            tmp_path.replace(file_path)
        finally:
            # Only still present when the write or the swap failed.
            if tmp_path.exists():
                tmp_path.unlink()

    @staticmethod
    def _normalize_documents(item: Dict[str, Any]) -> None:
        """Bring ``item["documents"]`` into canonical shape, in place.

        A non-list value becomes ``[]``. Non-dict entries are dropped, and each
        remaining document gets an ``id``, ``created_at``, ``updated_at`` and
        ``metadata`` if it lacks them.
        """
        docs = item.get("documents")
        if not isinstance(docs, list):
            item["documents"] = []
            return
        normalized: List[Dict[str, Any]] = []
        for doc in docs:
            if not isinstance(doc, dict):
                continue
            if not doc.get("id"):
                doc["id"] = str(uuid.uuid4())
            now = _utcnow_iso()
            doc.setdefault("created_at", now)
            doc.setdefault("updated_at", now)
            doc.setdefault("metadata", {})
            normalized.append(doc)
        item["documents"] = normalized

    def list(self, project_id: Optional[int] = None) -> List[Dict[str, Any]]:
        """Return all knowledge bases, or only those of ``project_id`` when given."""
        with self._lock:
            data = self._read()
            for item in data:
                self._normalize_documents(item)
            if project_id is None:
                return data
            return [item for item in data if item.get("project_id") == project_id]

    def get(self, kb_id: str) -> Optional[Dict[str, Any]]:
        """Return the knowledge base with ``kb_id`` (documents normalized), or ``None``."""
        with self._lock:
            for item in self._read():
                if item.get("id") == kb_id:
                    self._normalize_documents(item)
                    return item
            return None

    def create(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Create a knowledge base from ``payload`` and return it.

        ``payload["name"]`` is required. Defaults: ``chunk_size`` 512,
        ``chunk_overlap`` 50, ``top_k`` 3, ``is_active`` True.

        Raises:
            KeyError: If ``payload`` has no ``"name"``.
        """
        with self._lock:
            data = self._read()
            now = _utcnow_iso()
            item = {
                "id": str(uuid.uuid4()),
                "name": payload["name"],
                "description": payload.get("description"),
                "project_id": payload.get("project_id"),
                "embedding_model": payload.get("embedding_model"),
                "chunk_size": payload.get("chunk_size", 512),
                "chunk_overlap": payload.get("chunk_overlap", 50),
                "top_k": payload.get("top_k", 3),
                "is_active": payload.get("is_active", True),
                "created_at": now,
                "updated_at": now,
                "documents": [],
            }
            data.append(item)
            self._write(data)
            return item

    def update(self, kb_id: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Merge ``payload`` into the knowledge base with ``kb_id``.

        The ``id`` is never changed, even if ``payload`` carries one, and
        ``updated_at`` is refreshed. Returns the updated knowledge base, or
        ``None`` when none matches.
        """
        with self._lock:
            data = self._read()
            for item in data:
                if item.get("id") != kb_id:
                    continue
                # The id is the lookup key; a payload must not re-key the record.
                item.update((key, value) for key, value in payload.items() if key != "id")
                item["updated_at"] = _utcnow_iso()
                # The payload may have replaced "documents" with arbitrary data.
                self._normalize_documents(item)
                self._write(data)
                return item
            return None

    def delete(self, kb_id: str) -> bool:
        """Remove the knowledge base with ``kb_id``; return whether one was removed."""
        with self._lock:
            data = self._read()
            kept = [item for item in data if item.get("id") != kb_id]
            if len(kept) == len(data):
                return False
            self._write(kept)
            return True

    def create_document(self, kb_id: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Append a document to the knowledge base with ``kb_id``.

        ``payload["title"]`` and ``payload["content"]`` are required. Returns
        the new document, or ``None`` when the knowledge base does not exist.

        Raises:
            KeyError: If ``payload`` lacks ``"title"`` or ``"content"``.
        """
        with self._lock:
            data = self._read()
            for item in data:
                if item.get("id") != kb_id:
                    continue
                now = _utcnow_iso()
                doc = {
                    "id": str(uuid.uuid4()),
                    "title": payload["title"],
                    "content": payload["content"],
                    "metadata": payload.get("metadata", {}),
                    "created_at": now,
                    "updated_at": now,
                }
                self._normalize_documents(item)
                item["documents"].append(doc)
                item["updated_at"] = now
                self._write(data)
                return doc
            return None

    def update_document(self, kb_id: str, doc_id: str, payload: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Merge ``payload`` into one document of a knowledge base.

        The document ``id`` is never changed. Both the document's and the
        knowledge base's ``updated_at`` are refreshed. Returns the updated
        document, or ``None`` when the knowledge base or document is missing.
        """
        with self._lock:
            data = self._read()
            for item in data:
                if item.get("id") != kb_id:
                    continue
                self._normalize_documents(item)
                for doc in item["documents"]:
                    if doc.get("id") != doc_id:
                        continue
                    # The id is the lookup key; a payload must not re-key the document.
                    doc.update((key, value) for key, value in payload.items() if key != "id")
                    doc["updated_at"] = _utcnow_iso()
                    item["updated_at"] = _utcnow_iso()
                    self._write(data)
                    return doc
                return None
            return None

    def delete_document(self, kb_id: str, doc_id: str) -> bool:
        """Remove one document from a knowledge base; return whether it was removed."""
        with self._lock:
            data = self._read()
            for item in data:
                if item.get("id") != kb_id:
                    continue
                self._normalize_documents(item)
                docs = item["documents"]
                kept = [doc for doc in docs if doc.get("id") != doc_id]
                if len(kept) == len(docs):
                    return False
                item["documents"] = kept
                item["updated_at"] = _utcnow_iso()
                self._write(data)
                return True
            return False


knowledge_base_store = KnowledgeBaseStore()
|
||||
@@ -0,0 +1,58 @@
|
||||
import json
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict
|
||||
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
|
||||
class KnowledgeGlobalConfigStore:
    """Thread-safe JSON-file store for the global knowledge configuration.

    The configuration is a single JSON object in
    ``<data root>/knowledge_global_config.json`` with the keys ``api_base``,
    ``api_key`` and ``default_embedding_model``.
    """

    # The only keys this store exposes and persists.
    _FIELDS = ("api_base", "api_key", "default_embedding_model")

    def __init__(self) -> None:
        self._lock = threading.RLock()

    @staticmethod
    def _file_path() -> Path:
        """Return the location of the backing JSON file."""
        return get_data_root() / "knowledge_global_config.json"

    def _read(self) -> Dict[str, Any]:
        """Load the raw config; a missing, unreadable or non-object file yields ``{}``."""
        file_path = self._file_path()
        if not file_path.exists():
            return {}
        try:
            with file_path.open("r", encoding="utf-8") as file_obj:
                data = json.load(file_obj)
        except (OSError, json.JSONDecodeError):
            return {}
        return data if isinstance(data, dict) else {}

    def _write(self, data: Dict[str, Any]) -> None:
        """Persist ``data`` atomically.

        The JSON is written to a sibling temporary file that then replaces the
        real one, so a serialization or I/O failure leaves the previous
        configuration intact instead of a truncated file (which ``_read``
        would report as an empty configuration).
        """
        file_path = self._file_path()
        file_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = file_path.with_name(file_path.name + ".tmp")
        try:
            with tmp_path.open("w", encoding="utf-8") as file_obj:
                json.dump(data, file_obj, indent=2, ensure_ascii=False)
            tmp_path.replace(file_path)
        finally:
            # Only still present when the write or the swap failed.
            if tmp_path.exists():
                tmp_path.unlink()

    def get(self) -> Dict[str, Any]:
        """Return the three configuration values; unset ones are ``None``."""
        with self._lock:
            data = self._read()
            return {field: data.get(field) for field in self._FIELDS}

    def update(self, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Overwrite the configuration keys present in ``payload`` and persist.

        Keys other than the three known ones are ignored. Returns the
        resulting configuration.
        """
        with self._lock:
            current = self.get()
            for field in self._FIELDS:
                if field in payload:
                    current[field] = payload.get(field)
            self._write(current)
            return current


knowledge_global_config_store = KnowledgeGlobalConfigStore()
|
||||
@@ -0,0 +1,267 @@
|
||||
import math
|
||||
import re
|
||||
import threading
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
from app.services.knowledge_base_store import knowledge_base_store
|
||||
from app.services.knowledge_global_config_store import knowledge_global_config_store
|
||||
from app.services.openai_compat import normalize_openai_base_url
|
||||
|
||||
try:
|
||||
from llama_index.core import Document, VectorStoreIndex
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
LLAMAINDEX_AVAILABLE = True
|
||||
except Exception:
|
||||
Document = Any
|
||||
VectorStoreIndex = Any
|
||||
SentenceSplitter = Any
|
||||
LLAMAINDEX_AVAILABLE = False
|
||||
|
||||
|
||||
def _tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercase search tokens.

    A token is either a run of ASCII letters and digits or a single character
    in the range U+4E00-U+9FFF. Everything else acts as a separator, and a
    falsy ``text`` yields no tokens.
    """
    lowered = (text or "").lower()
    return re.findall(r"[a-zA-Z0-9]+|[\u4e00-\u9fff]", lowered)
|
||||
|
||||
|
||||
def _normalize_embedding_api_base(api_base: str) -> str:
    """Return ``api_base`` as normalized by ``normalize_openai_base_url``."""
    normalized = normalize_openai_base_url(api_base)
    return normalized
|
||||
|
||||
|
||||
@dataclass
class SearchHit:
    """One retrieved chunk together with its source document and score."""

    # Id of the document the chunk came from.
    doc_id: str
    # Title of that document.
    title: str
    # Text of the matching chunk.
    chunk: str
    # Relevance score assigned by the search.
    score: float
    # Metadata carried over from the source document.
    metadata: Dict[str, Any]
|
||||
|
||||
|
||||
class KnowledgeIndexService:
    """Build, cache, and query search indexes for knowledge bases.

    Each cache entry is ``(signature, index, fallback_chunks)`` keyed by the
    knowledge-base id. ``index`` is the object returned by
    ``VectorStoreIndex.from_documents`` or ``None``; when it is ``None`` (or
    retrieval returns nothing) the token-overlap search over
    ``fallback_chunks`` is used instead.
    """

    def __init__(self) -> None:
        # Re-entrant lock guarding every read/write of ``_cache``.
        self._lock = threading.RLock()
        self._cache: Dict[str, Tuple[str, Any, List[Dict[str, Any]]]] = {}

    @staticmethod
    def _signature(kb: Dict[str, Any]) -> str:
        """Return a string that changes whenever the cached index is stale.

        The signature combines the knowledge base's ``updated_at``,
        ``chunk_size`` and ``chunk_overlap`` with each document's id,
        ``updated_at`` and content length. Missing or ``None`` content counts
        as length 0, matching how the chunkers treat it.
        """
        doc_parts = [
            f"{doc.get('id')}:{doc.get('updated_at')}:{len(doc.get('content') or '')}"
            for doc in kb.get("documents") or []
        ]
        return "|".join(
            [
                str(kb.get("updated_at")),
                str(kb.get("chunk_size")),
                str(kb.get("chunk_overlap")),
                *doc_parts,
            ]
        )

    @staticmethod
    def _fallback_chunks(kb: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Cut every document into fixed-size character windows.

        Windows are ``chunk_size`` characters long (default 512) and advance
        by ``chunk_size - chunk_overlap`` characters (default overlap 50, step
        never below 1). Documents without content are skipped. Chunking of a
        document stops as soon as a window reaches the end of the text, so no
        trailing chunk is emitted that is fully contained in the previous one.
        """
        chunks: List[Dict[str, Any]] = []
        chunk_size = int(kb.get("chunk_size") or 512)
        overlap = int(kb.get("chunk_overlap") or 50)
        step = max(1, chunk_size - overlap)
        for doc in kb.get("documents") or []:
            text = doc.get("content") or ""
            if not text:
                continue
            for start in range(0, len(text), step):
                chunks.append(
                    {
                        "doc_id": doc.get("id", ""),
                        "title": doc.get("title", ""),
                        "chunk": text[start : start + chunk_size],
                        "metadata": doc.get("metadata") or {},
                    }
                )
                # This window already covers the tail of the text; any further
                # window would only repeat characters from it.
                if start + chunk_size >= len(text):
                    break
        return chunks

    def _build_index(self, kb: Dict[str, Any]) -> Tuple[Any, List[Dict[str, Any]]]:
        """Build ``(index, fallback_chunks)`` for a knowledge base.

        ``index`` is ``None`` when LlamaIndex is unavailable or no document
        has non-blank content; the fallback chunks are always returned.
        """
        fallback_chunks = self._fallback_chunks(kb)
        if not LLAMAINDEX_AVAILABLE:
            return None, fallback_chunks

        chunk_size = int(kb.get("chunk_size") or 512)
        overlap = int(kb.get("chunk_overlap") or 50)
        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        docs = [
            Document(
                text=(doc.get("content") or ""),
                metadata={
                    "doc_id": doc.get("id", ""),
                    "title": doc.get("title", ""),
                    **(doc.get("metadata") or {}),
                },
            )
            for doc in kb.get("documents") or []
            if (doc.get("content") or "").strip()
        ]
        if not docs:
            return None, fallback_chunks

        # Only pass ``embed_model`` when one could be built, so the library's
        # own default applies otherwise.
        index_kwargs: Dict[str, Any] = {"transformations": [splitter]}
        embed_model = self._build_embed_model(kb)
        if embed_model is not None:
            index_kwargs["embed_model"] = embed_model
        index = VectorStoreIndex.from_documents(docs, **index_kwargs)
        return index, fallback_chunks

    @staticmethod
    def _build_embed_model(kb: Dict[str, Any]) -> Any:
        """Create an embedding client for the knowledge base, or ``None``.

        The configured ``embedding_model`` value is matched against stored
        models by ``id`` first, then by ``model`` name; without a match the
        first stored model is used. ``None`` is returned when no models are
        stored, when the chosen model lacks ``api_base``/``api_key``/``model``,
        or when neither embedding class can be imported and constructed.
        """
        from app.services.embedding_model_store import embedding_model_store

        models = embedding_model_store.list_models()
        if not models:
            return None

        target_model = None
        kb_model_val = kb.get("embedding_model")
        if kb_model_val:
            # Try matching by ID first, then by model name.
            target_model = next((m for m in models if m.get("id") == kb_model_val), None)
            if not target_model:
                target_model = next((m for m in models if m.get("model") == kb_model_val), None)
        if not target_model:
            # Fall back to the first stored model.
            target_model = models[0]

        api_base = target_model.get("api_base")
        api_key = target_model.get("api_key")
        model_name = target_model.get("model")
        if not api_base or not api_key or not model_name:
            return None

        embed_kwargs = {
            "model_name": model_name,
            "api_base": _normalize_embedding_api_base(api_base),
            "api_key": api_key,
            "embed_batch_size": 10,
        }
        try:
            from llama_index.embeddings.openai_like import OpenAILikeEmbedding

            return OpenAILikeEmbedding(**embed_kwargs)
        except Exception:
            # Best effort: fall through to the plain OpenAI embedding class.
            pass
        try:
            from llama_index.embeddings.openai import OpenAIEmbedding

            return OpenAIEmbedding(**embed_kwargs)
        except Exception:
            # Best effort: the caller builds the index without an explicit model.
            return None

    def reindex(self, kb_id: str) -> Dict[str, Any]:
        """Rebuild and cache the index for ``kb_id`` unconditionally.

        Returns a status dictionary with the document count and the engine
        in use (``"llamaindex"`` or ``"fallback"``).

        Raises:
            ValueError: if the knowledge base does not exist.
        """
        kb = knowledge_base_store.get(kb_id)
        if not kb:
            raise ValueError("Knowledge base not found")
        with self._lock:
            signature = self._signature(kb)
            index, fallback_chunks = self._build_index(kb)
            self._cache[kb_id] = (signature, index, fallback_chunks)
        return {
            "kb_id": kb_id,
            "status": "ok",
            "documents": len(kb.get("documents") or []),
            "engine": "llamaindex" if LLAMAINDEX_AVAILABLE and index is not None else "fallback",
        }

    @staticmethod
    def _fallback_search(query: str, chunks: List[Dict[str, Any]], top_k: int) -> List[SearchHit]:
        """Rank chunks by token overlap with the query.

        A chunk's score is the number of its tokens that occur in the query,
        divided by the square root of its token count. Chunks without any
        overlap are dropped. At most ``top_k`` hits are returned, best first;
        a non-positive ``top_k`` yields no hits.
        """
        if top_k <= 0:
            # A negative slice bound would drop hits from the end instead.
            return []
        q_tokens = _tokenize(query)
        if not q_tokens:
            return []
        q_set = set(q_tokens)
        scored: List[SearchHit] = []
        for chunk_item in chunks:
            c_tokens = _tokenize(chunk_item.get("chunk", ""))
            if not c_tokens:
                continue
            overlap = sum(1 for t in c_tokens if t in q_set)
            if overlap == 0:
                continue
            score = overlap / math.sqrt(len(c_tokens))
            scored.append(
                SearchHit(
                    doc_id=chunk_item.get("doc_id", ""),
                    title=chunk_item.get("title", ""),
                    chunk=chunk_item.get("chunk", ""),
                    score=float(score),
                    metadata=chunk_item.get("metadata") or {},
                )
            )
        scored.sort(key=lambda x: x.score, reverse=True)
        return scored[:top_k]

    @classmethod
    def _fallback_response(
        cls, query: str, chunks: List[Dict[str, Any]], top_k: int
    ) -> Dict[str, Any]:
        """Run the fallback search and shape it like a ``search`` result."""
        hits = cls._fallback_search(query=query, chunks=chunks, top_k=top_k)
        return {
            "answer": "\n\n".join(hit.chunk for hit in hits),
            "hits": [hit.__dict__ for hit in hits],
        }

    def search(self, kb_id: str, query: str, top_k: int | None = None) -> Dict[str, Any]:
        """Search a knowledge base and return ``{"answer", "hits"}``.

        ``answer`` is the hit chunks joined by blank lines; ``hits`` is a list
        of dictionaries with ``doc_id``, ``title``, ``chunk``, ``score`` and
        ``metadata``. The index is rebuilt when the cached signature no longer
        matches. ``top_k`` falls back to the knowledge base's ``top_k`` and
        then to 3, and is never lower than 1.

        Raises:
            ValueError: if the knowledge base does not exist.
        """
        kb = knowledge_base_store.get(kb_id)
        if not kb:
            raise ValueError("Knowledge base not found")
        if not kb.get("documents"):
            return {"answer": "", "hits": []}

        effective_top_k = max(1, int(top_k or kb.get("top_k") or 3))
        with self._lock:
            signature = self._signature(kb)
            cached = self._cache.get(kb_id)
            if not cached or cached[0] != signature:
                index, fallback_chunks = self._build_index(kb)
                cached = (signature, index, fallback_chunks)
                self._cache[kb_id] = cached
            _, index, fallback_chunks = cached

        if index is None:
            return self._fallback_response(query, fallback_chunks, effective_top_k)

        retriever = index.as_retriever(similarity_top_k=effective_top_k)
        response_nodes = retriever.retrieve(query)
        hits: List[Dict[str, Any]] = []
        for node_with_score in response_nodes:
            node = getattr(node_with_score, "node", None)
            # Guard against a node whose metadata attribute is None.
            metadata = (getattr(node, "metadata", None) or {}) if node is not None else {}
            chunk_text = ""
            if node is not None and hasattr(node, "get_content"):
                chunk_text = node.get_content()
            elif node is not None:
                chunk_text = str(getattr(node, "text", ""))
            hits.append(
                {
                    "doc_id": metadata.get("doc_id", ""),
                    "title": metadata.get("title", ""),
                    "chunk": chunk_text,
                    "score": float(getattr(node_with_score, "score", 0.0) or 0.0),
                    "metadata": metadata,
                }
            )

        if not hits:
            # Retrieval found nothing; try the token-overlap search instead.
            return self._fallback_response(query, fallback_chunks, effective_top_k)

        answer = "\n\n".join(item.get("chunk", "") for item in hits if item.get("chunk"))
        return {"answer": answer, "hits": hits}
|
||||
|
||||
|
||||
knowledge_index_service = KnowledgeIndexService()
|
||||
@@ -0,0 +1,24 @@
|
||||
import os
|
||||
import threading
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from app.api.llm import DATA_FILE, _load_data
|
||||
|
||||
_cache_lock = threading.RLock()
|
||||
_cache_mtime: float = -1.0
|
||||
_cache_data: List[Dict[str, Any]] = []
|
||||
|
||||
|
||||
def get_llm_configs() -> List[Dict[str, Any]]:
    """Return the LLM configurations, reloading them when the data file changes.

    The data file's modification time is compared with the cached one; on a
    mismatch the data is reloaded through ``_load_data``. A missing or
    unreadable file is treated as modification time ``-1.0``. The returned
    list is a shallow copy of the cache, so the dictionaries inside it are
    shared with the cache.
    """
    global _cache_mtime, _cache_data
    with _cache_lock:
        # Stat inside the lock so the stored mtime always belongs to the data
        # loaded in the same critical section.
        try:
            current_mtime = os.path.getmtime(DATA_FILE)
        except OSError:
            current_mtime = -1.0
        if current_mtime != _cache_mtime:
            _cache_data = _load_data()
            _cache_mtime = current_mtime
        return list(_cache_data)
|
||||
|
||||
|
||||
def get_active_llm_config() -> Optional[Dict[str, Any]]:
    """Return the first configuration whose ``is_active`` value is truthy.

    Returns ``None`` when no configuration is active.
    """
    for candidate in get_llm_configs():
        if candidate.get("is_active"):
            return candidate
    return None
|
||||
@@ -0,0 +1,169 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional, Dict, Any, List
|
||||
from app.models.datasource import DataSource
|
||||
from app.schemas.mdl import MDLManifest, Model, Column, TableReference
|
||||
from app.connectors.factory import get_connector
|
||||
from app.database import SessionLocal
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
MDL_STORAGE_PATH = get_data_root() / "mdl"
|
||||
|
||||
class MDLService:
    """Generate, load, and persist MDL manifests, one JSON file per datasource."""

    @staticmethod
    def _get_mdl_path(datasource_id: int) -> Path:
        """Return the manifest path for a datasource, creating the storage directory."""
        MDL_STORAGE_PATH.mkdir(parents=True, exist_ok=True)
        return MDL_STORAGE_PATH / f"{datasource_id}.json"

    @staticmethod
    def get_raw_schema(datasource: DataSource) -> Dict[str, List[Dict[str, str]]]:
        """Fetch the schema through the datasource's connector.

        Any error raised while fetching is printed and an empty dictionary is
        returned instead.
        """
        connector = get_connector(datasource)
        try:
            return connector.get_schema()
        except Exception as e:
            print(f"Error fetching schema for DS {datasource.id}: {e}")
            return {}

    @staticmethod
    def _parse_column(col_info: Any) -> tuple:
        """Return ``(name, type)`` for one raw column entry.

        Dictionaries supply ``name``/``type`` directly. Strings in the legacy
        ``"name (type)"`` form are split on the last ``" ("``; any other value
        keeps its string form as the name with type ``"UNKNOWN"``.
        """
        if isinstance(col_info, dict):
            return col_info.get("name", "UNKNOWN"), col_info.get("type", "UNKNOWN")
        if isinstance(col_info, str):
            if "(" in col_info and col_info.endswith(")"):
                parts = col_info.rsplit(" (", 1)
                if len(parts) == 2:
                    # Drop the closing parenthesis from the type part.
                    return parts[0], parts[1][:-1]
            return col_info, "UNKNOWN"
        return str(col_info), "UNKNOWN"

    @staticmethod
    def generate_default_mdl(
        datasource: DataSource,
        selected_tables: Optional[List[str]] = None,
        selected_columns: Optional[Dict[str, List[str]]] = None,
    ) -> MDLManifest:
        """Build a manifest from the datasource's raw schema.

        Args:
            datasource: Datasource whose schema is read.
            selected_tables: When given, only these tables become models and
                only foreign keys pointing at them become relationships.
            selected_columns: Optional per-table allow-list of column names;
                a table with no entry or an empty list keeps all columns.

        Returns:
            A manifest with one model per table that has at least one column,
            and one ``MANY_TO_ONE`` relationship per single-column foreign key.
        """
        from app.schemas.mdl import Relationship

        raw_schema = MDLService.get_raw_schema(datasource)
        models = []
        relationships = []

        def get_table_info(t_name):
            # The raw schema maps a table either to a plain column list or to
            # a dict carrying "columns" plus key information.
            data = raw_schema.get(t_name, [])
            if isinstance(data, dict) and "columns" in data:
                return data
            return {"columns": data, "primary_keys": [], "foreign_keys": []}

        for table_name in raw_schema.keys():
            if selected_tables is not None and table_name not in selected_tables:
                continue

            table_info = get_table_info(table_name)
            # ``or []`` tolerates entries that are present but set to None.
            columns = table_info["columns"] or []
            pks = table_info.get("primary_keys") or []

            model_cols = []
            for col_info in columns:
                name, type_ = MDLService._parse_column(col_info)
                if selected_columns is not None:
                    allowed = selected_columns.get(table_name, [])
                    if allowed and name not in allowed:
                        continue
                is_pk = name in pks
                model_cols.append(
                    Column(name=name, type=type_, properties={"is_primary_key": is_pk})
                )

            if not model_cols:
                continue

            models.append(
                Model(
                    name=table_name,
                    tableReference=TableReference(table=table_name),
                    columns=model_cols,
                    primaryKey=pks[0] if pks else None,
                )
            )

            # Extract relationships from foreign keys.
            for fk in table_info.get("foreign_keys") or []:
                referred_table = fk.get("referred_table")
                if not referred_table:
                    continue
                # Skip if the referred table is not selected.
                if selected_tables is not None and referred_table not in selected_tables:
                    continue

                constrained_cols = fk.get("constrained_columns") or []
                referred_cols = fk.get("referred_columns") or []
                # Only simple single-column foreign keys are translated.
                if len(constrained_cols) != 1 or len(referred_cols) != 1:
                    continue

                fk_col_name = constrained_cols[0]
                for col in model_cols:
                    if col.name == fk_col_name:
                        col.properties["is_foreign_key"] = True

                condition = f"{table_name}.{constrained_cols[0]} = {referred_table}.{referred_cols[0]}"
                rel_name = f"{table_name}_{constrained_cols[0]}_to_{referred_table}"
                relationships.append(
                    Relationship(
                        name=rel_name,
                        models=[table_name, referred_table],
                        joinType="MANY_TO_ONE",  # typically a foreign key represents many-to-one
                        condition=condition,
                    )
                )

        return MDLManifest(
            catalog="default",
            schema="public",  # Default schema, might need adjustment based on datasource config
            dataSource=datasource.type.upper(),
            models=models,
            relationships=relationships,
        )

    @staticmethod
    def get_mdl(datasource_id: int) -> Optional[MDLManifest]:
        """Load the stored manifest, or return ``None``.

        ``None`` is returned when the file does not exist or when reading or
        validating it fails (the error is printed).
        """
        path = MDLService._get_mdl_path(datasource_id)
        if not path.exists():
            return None
        try:
            # Explicit UTF-8 so the file reads the same on every platform.
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # Pydantic v2 compatible
            return MDLManifest.model_validate(data)
        except Exception as e:
            print(f"Error loading MDL for {datasource_id}: {e}")
            return None

    @staticmethod
    def save_mdl(datasource_id: int, mdl: MDLManifest):
        """Write the manifest as indented JSON using field aliases."""
        path = MDLService._get_mdl_path(datasource_id)
        # The JSON may contain non-ASCII characters, so do not depend on the
        # platform's default encoding.
        with open(path, "w", encoding="utf-8") as f:
            f.write(mdl.model_dump_json(indent=2, by_alias=True))

    @staticmethod
    def get_or_create_mdl(datasource_id: int) -> MDLManifest:
        """Return the stored manifest, generating and saving one if absent.

        Raises:
            ValueError: if no datasource with ``datasource_id`` exists.
        """
        mdl = MDLService.get_mdl(datasource_id)
        if mdl:
            return mdl

        # Generate new
        db = SessionLocal()
        try:
            ds = db.query(DataSource).filter(DataSource.id == datasource_id).first()
            if not ds:
                raise ValueError(f"DataSource {datasource_id} not found")
            mdl = MDLService.generate_default_mdl(ds)
            MDLService.save_mdl(datasource_id, mdl)
            return mdl
        finally:
            db.close()
|
||||
@@ -0,0 +1,5 @@
|
||||
def normalize_openai_base_url(api_base: str) -> str:
    """Normalize an OpenAI-style base URL.

    Surrounding whitespace and trailing slashes are removed, and a trailing
    ``/embeddings`` path segment (matched case-insensitively) is dropped so
    the result is the API root rather than the endpoint. Slashes left in
    front of the removed segment are stripped as well. A falsy ``api_base``
    yields ``""``.
    """
    suffix = "/embeddings"
    normalized = (api_base or "").strip().rstrip("/")
    if normalized.lower().endswith(suffix):
        # Strip again: "…/v1//embeddings" would otherwise keep a trailing "/".
        normalized = normalized[: -len(suffix)].rstrip("/")
    return normalized
|
||||
@@ -0,0 +1,51 @@
|
||||
import os
|
||||
import json
|
||||
import threading
|
||||
from typing import Any, Dict
|
||||
|
||||
from app.core.data_root import get_data_root
|
||||
|
||||
_cache_lock = threading.RLock()
|
||||
_cache_mtime: float = -1.0
|
||||
_cache_data: Dict[str, Any] = {}
|
||||
|
||||
def get_config_file_path() -> str:
    """Return the path of ``web_search_config.json`` under the data root, as a string."""
    config_path = get_data_root() / "web_search_config.json"
    return str(config_path)
|
||||
|
||||
def get_web_search_config() -> Dict[str, Any]:
    """Return the web-search configuration, cached by file modification time.

    The config file is re-read whenever its modification time differs from
    the cached one, or while the cache is still empty. The built-in defaults
    (DuckDuckGo, no key, no base URL, 5 results) are used when the file is
    missing, unreadable, not valid JSON, or does not contain a JSON object.
    The returned dictionary is a shallow copy of the cache.
    """
    global _cache_mtime, _cache_data
    defaults: Dict[str, Any] = {
        "provider": "duckduckgo",
        "api_key": "",
        "base_url": "",
        "max_results": 5,
    }
    config_file = get_config_file_path()

    with _cache_lock:
        # Stat inside the lock so the stored mtime matches the data loaded here.
        try:
            current_mtime = os.path.getmtime(config_file)
        except OSError:
            current_mtime = -1.0

        # ``not _cache_data`` covers the very first call without a config
        # file: the "missing" mtime equals the initial cached mtime, so the
        # defaults would otherwise never be applied.
        if current_mtime != _cache_mtime or not _cache_data:
            loaded: Any = None
            if current_mtime != -1.0:
                try:
                    with open(config_file, "r", encoding="utf-8") as f:
                        loaded = json.load(f)
                except (OSError, ValueError):
                    # json.JSONDecodeError is a ValueError; fall back to defaults.
                    loaded = None
            _cache_data = loaded if isinstance(loaded, dict) else defaults
            _cache_mtime = current_mtime
        return dict(_cache_data)
|
||||
|
||||
def save_web_search_config(config: Dict[str, Any]) -> None:
    """Persist ``config`` as indented JSON and refresh the in-memory cache.

    Raises:
        TypeError: if ``config`` is not JSON-serializable; in that case the
            existing file is left untouched.
    """
    global _cache_mtime, _cache_data
    config_file = get_config_file_path()
    # Serialize before opening the file: opening with "w" truncates it, so a
    # serialization error afterwards would leave a corrupt config behind.
    payload = json.dumps(config, indent=2)
    os.makedirs(os.path.dirname(config_file), exist_ok=True)
    with _cache_lock:
        with open(config_file, "w", encoding="utf-8") as f:
            f.write(payload)
        _cache_data = dict(config)
        _cache_mtime = os.path.getmtime(config_file)
|
||||
@@ -0,0 +1,177 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
@@ -0,0 +1,42 @@
|
||||
---
|
||||
name: frontend-design
|
||||
description: Create distinctive, production-grade frontend interfaces with high design quality. Use this skill when the user asks to build web components, pages, artifacts, posters, or applications (examples include websites, landing pages, dashboards, React components, HTML/CSS layouts, or when styling/beautifying any web UI). Generates creative, polished code and UI design that avoids generic AI aesthetics.
|
||||
license: Complete terms in LICENSE.txt
|
||||
---
|
||||
|
||||
This skill guides creation of distinctive, production-grade frontend interfaces that avoid generic "AI slop" aesthetics. Implement real working code with exceptional attention to aesthetic details and creative choices.
|
||||
|
||||
The user provides frontend requirements: a component, page, application, or interface to build. They may include context about the purpose, audience, or technical constraints.
|
||||
|
||||
## Design Thinking
|
||||
|
||||
Before coding, understand the context and commit to a BOLD aesthetic direction:
|
||||
- **Purpose**: What problem does this interface solve? Who uses it?
|
||||
- **Tone**: Pick an extreme: brutally minimal, maximalist chaos, retro-futuristic, organic/natural, luxury/refined, playful/toy-like, editorial/magazine, brutalist/raw, art deco/geometric, soft/pastel, industrial/utilitarian, etc. There are so many flavors to choose from. Use these for inspiration but design one that is true to the aesthetic direction.
|
||||
- **Constraints**: Technical requirements (framework, performance, accessibility).
|
||||
- **Differentiation**: What makes this UNFORGETTABLE? What's the one thing someone will remember?
|
||||
|
||||
**CRITICAL**: Choose a clear conceptual direction and execute it with precision. Bold maximalism and refined minimalism both work - the key is intentionality, not intensity.
|
||||
|
||||
Then implement working code (HTML/CSS/JS, React, Vue, etc.) that is:
|
||||
- Production-grade and functional
|
||||
- Visually striking and memorable
|
||||
- Cohesive with a clear aesthetic point-of-view
|
||||
- Meticulously refined in every detail
|
||||
|
||||
## Frontend Aesthetics Guidelines
|
||||
|
||||
Focus on:
|
||||
- **Typography**: Choose fonts that are beautiful, unique, and interesting. Avoid generic fonts like Arial and Inter; opt instead for distinctive choices that elevate the frontend's aesthetics; unexpected, characterful font choices. Pair a distinctive display font with a refined body font.
|
||||
- **Color & Theme**: Commit to a cohesive aesthetic. Use CSS variables for consistency. Dominant colors with sharp accents outperform timid, evenly-distributed palettes.
|
||||
- **Motion**: Use animations for effects and micro-interactions. Prioritize CSS-only solutions for HTML. Use Motion library for React when available. Focus on high-impact moments: one well-orchestrated page load with staggered reveals (animation-delay) creates more delight than scattered micro-interactions. Use scroll-triggering and hover states that surprise.
|
||||
- **Spatial Composition**: Unexpected layouts. Asymmetry. Overlap. Diagonal flow. Grid-breaking elements. Generous negative space OR controlled density.
|
||||
- **Backgrounds & Visual Details**: Create atmosphere and depth rather than defaulting to solid colors. Add contextual effects and textures that match the overall aesthetic. Apply creative forms like gradient meshes, noise textures, geometric patterns, layered transparencies, dramatic shadows, decorative borders, custom cursors, and grain overlays.
|
||||
|
||||
NEVER use generic AI-generated aesthetics like overused font families (Inter, Roboto, Arial, system fonts), cliched color schemes (particularly purple gradients on white backgrounds), predictable layouts and component patterns, and cookie-cutter design that lacks context-specific character.
|
||||
|
||||
Interpret creatively and make unexpected choices that feel genuinely designed for the context. No design should be the same. Vary between light and dark themes, different fonts, different aesthetics. NEVER converge on common choices (Space Grotesk, for example) across generations.
|
||||
|
||||
**IMPORTANT**: Match implementation complexity to the aesthetic vision. Maximalist designs need elaborate code with extensive animations and effects. Minimalist or refined designs need restraint, precision, and careful attention to spacing, typography, and subtle details. Elegance comes from executing the vision well.
|
||||
|
||||
Remember: Claude is capable of extraordinary creative work. Don't hold back, show what can truly be created when thinking outside the box and committing fully to a distinctive vision.
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
description: Retrieve information from the knowledge base (RAG)
|
||||
metadata:
|
||||
nanobot:
|
||||
always: true
|
||||
---
|
||||
|
||||
# Knowledge Base Skill
|
||||
|
||||
You are an expert assistant with access to a knowledge base. You can use the `knowledge_retrieve` tool to find relevant information from documents to answer user questions.
|
||||
|
||||
## When to use this skill
|
||||
- When the user asks questions about specific documents, company policies, technical manuals, or any uploaded knowledge.
|
||||
- When the user's question seems to require domain-specific knowledge that is not in your general training data but likely in the knowledge base.
|
||||
|
||||
## How to use this skill
|
||||
- Call the `knowledge_retrieve` tool with the user's query.
|
||||
- You can adjust `top_k` (default is 5, max 20) if you need more or less context.
|
||||
|
||||
## After using the tool
|
||||
- The tool will return a list of relevant "hits" (document chunks) and optionally an AI-generated answer based on those hits.
|
||||
- Synthesize the information from the hits to provide a comprehensive and accurate answer.
|
||||
- Always cite the sources if provided in the metadata of the hits.
|
||||
- If no relevant information is found, inform the user clearly.
|
||||
@@ -0,0 +1,3 @@
|
||||
obj/
|
||||
bin/
|
||||
*.user
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2026 MiniMaxAI
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,274 @@
|
||||
---
|
||||
name: minimax-docx
|
||||
license: MIT
|
||||
metadata:
|
||||
version: "1.0.0"
|
||||
category: document-processing
|
||||
author: MiniMaxAI
|
||||
sources:
|
||||
- "ECMA-376 Office Open XML File Formats"
|
||||
- "GB/T 9704-2012 Layout Standard for Official Documents"
|
||||
- "IEEE / ACM / APA / MLA / Chicago / Turabian Style Guides"
|
||||
- "Springer LNCS / Nature / HBR Document Templates"
|
||||
description: >
|
||||
Professional DOCX document creation, editing, and formatting using OpenXML SDK (.NET).
|
||||
Three pipelines: (A) create new documents from scratch, (B) fill/edit content in existing
|
||||
documents, (C) apply template formatting with XSD validation gate-check.
|
||||
MUST use this skill whenever the user wants to produce, modify, or format a Word document —
|
||||
including when they say "write a report", "draft a proposal", "make a contract",
|
||||
"fill in this form", "reformat to match this template", or any task whose final output
|
||||
is a .docx file. Even if the user doesn't mention "docx" explicitly, if the task
|
||||
implies a printable/formal document, use this skill.
|
||||
triggers:
|
||||
- Word
|
||||
- docx
|
||||
- document
|
||||
- 文档
|
||||
- Word文档
|
||||
- 报告
|
||||
- 合同
|
||||
- 公文
|
||||
- 排版
|
||||
- 套模板
|
||||
---
|
||||
|
||||
# minimax-docx
|
||||
|
||||
Create, edit, and format DOCX documents via CLI tools or direct C# scripts built on OpenXML SDK (.NET).
|
||||
|
||||
## Setup
|
||||
|
||||
**First time:** `bash scripts/setup.sh` (or `powershell scripts/setup.ps1` on Windows; add `--minimal` to skip optional dependencies).
|
||||
|
||||
**First operation in session:** run `scripts/env_check.sh` — do not proceed if it reports `NOT READY`. (Skip this check on subsequent operations within the same session.)
|
||||
|
||||
## Quick Start: Direct C# Path
|
||||
|
||||
When the task requires structural document manipulation (custom styles, complex tables, multi-section layouts, headers/footers, TOC, images), write C# directly instead of wrestling with CLI limitations. Use this scaffold:
|
||||
|
||||
```csharp
|
||||
// File: scripts/dotnet/task.csx (or a new .cs in a Console project)
|
||||
// dotnet run --project scripts/dotnet/MiniMaxAIDocx.Cli -- run-script task.csx
|
||||
#r "nuget: DocumentFormat.OpenXml, 3.2.0"
|
||||
|
||||
using DocumentFormat.OpenXml;
|
||||
using DocumentFormat.OpenXml.Packaging;
|
||||
using DocumentFormat.OpenXml.Wordprocessing;
|
||||
|
||||
using var doc = WordprocessingDocument.Create("output.docx", WordprocessingDocumentType.Document);
|
||||
var mainPart = doc.AddMainDocumentPart();
|
||||
mainPart.Document = new Document(new Body());
|
||||
|
||||
// --- Your logic here ---
|
||||
// Read the relevant Samples/*.cs file FIRST for tested patterns.
|
||||
// See Samples/ table in References section below.
|
||||
```
|
||||
|
||||
**Before writing any C#, read the relevant `Samples/*.cs` file** — they contain compilable, SDK-version-verified patterns. The Samples table in the References section below maps topics to files.
|
||||
|
||||
## CLI shorthand
|
||||
|
||||
All CLI commands below use `$CLI` as shorthand for:
|
||||
```bash
|
||||
dotnet run --project scripts/dotnet/MiniMaxAIDocx.Cli --
|
||||
```
|
||||
|
||||
## Pipeline routing
|
||||
|
||||
Route by checking: does the user have an input .docx file?
|
||||
|
||||
```
|
||||
User task
|
||||
├─ No input file → Pipeline A: CREATE
|
||||
│ signals: "write", "create", "draft", "generate", "new", "make a report/proposal/memo"
|
||||
│ → Read references/scenario_a_create.md
|
||||
│
|
||||
└─ Has input .docx
|
||||
├─ Replace/fill/modify content → Pipeline B: FILL-EDIT
|
||||
│ signals: "fill in", "replace", "update", "change text", "add section", "edit"
|
||||
│ → Read references/scenario_b_edit_content.md
|
||||
│
|
||||
└─ Reformat/apply style/template → Pipeline C: FORMAT-APPLY
|
||||
signals: "reformat", "apply template", "restyle", "match this format", "套模板", "排版"
|
||||
├─ Template is pure style (no content) → C-1: OVERLAY (apply styles to source)
|
||||
└─ Template has structure (cover/TOC/example sections) → C-2: BASE-REPLACE
|
||||
(use template as base, replace example content with user content)
|
||||
→ Read references/scenario_c_apply_template.md
|
||||
```
|
||||
|
||||
If the request spans multiple pipelines, run them sequentially (e.g., Create then Format-Apply).
|
||||
|
||||
## Pre-processing
|
||||
|
||||
Convert `.doc` → `.docx` if needed: `scripts/doc_to_docx.sh input.doc output_dir/`
|
||||
|
||||
Preview before editing (avoids reading raw XML): `scripts/docx_preview.sh document.docx`
|
||||
|
||||
Analyze structure for editing scenarios: `$CLI analyze --input document.docx`
|
||||
|
||||
## Scenario A: Create
|
||||
|
||||
Read `references/scenario_a_create.md`, `references/typography_guide.md`, and `references/design_principles.md` first. Pick an aesthetic recipe from `Samples/AestheticRecipeSamples.cs` that matches the document type — do not invent formatting values. For CJK, also read `references/cjk_typography.md`.
|
||||
|
||||
**Choose your path:**
|
||||
- **Simple** (plain text, minimal formatting): use CLI — `$CLI create --type report --output out.docx --config content.json`
|
||||
- **Structural** (custom styles, multi-section, TOC, images, complex tables): write C# directly. Read the relevant `Samples/*.cs` first.
|
||||
|
||||
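CLI options: `--type` (report|letter|memo|academic), `--title`, `--author`, `--page-size` (letter|a4|legal|a3), `--margins` (standard|narrow|wide), `--header`, `--footer`, `--page-numbers`, `--toc`, `--content-json`. (Note: the Simple-path example above passes the content file with `--config`; confirm which flag name the CLI accepts before relying on either.)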
|
||||
|
||||
Then run the **validation pipeline** (below).
|
||||
|
||||
## Scenario B: Edit / Fill
|
||||
|
||||
Read `references/scenario_b_edit_content.md` first. Preview → analyze → edit → validate.
|
||||
|
||||
**Choose your path:**
|
||||
- **Simple** (text replacement, placeholder fill): use CLI subcommands.
|
||||
- **Structural** (add/reorganize sections, modify styles, manipulate tables, insert images): write C# directly. Read `references/openxml_element_order.md` and the relevant `Samples/*.cs`.
|
||||
|
||||
Available CLI edit subcommands:
|
||||
- `replace-text --find "X" --replace "Y"`
|
||||
- `fill-placeholders --data '{"key":"value"}'`
|
||||
- `fill-table --data table.json`
|
||||
- `insert-section`, `remove-section`, `update-header-footer`
|
||||
|
||||
```bash
|
||||
$CLI edit replace-text --input in.docx --output out.docx --find "OLD" --replace "NEW"
|
||||
$CLI edit fill-placeholders --input in.docx --output out.docx --data '{"name":"John"}'
|
||||
```
|
||||
|
||||
Then run the **validation pipeline**. Also run diff to verify minimal changes:
|
||||
```bash
|
||||
$CLI diff --before in.docx --after out.docx
|
||||
```
|
||||
|
||||
## Scenario C: Apply Template
|
||||
|
||||
Read `references/scenario_c_apply_template.md` first. Preview and analyze both source and template.
|
||||
|
||||
```bash
|
||||
$CLI apply-template --input source.docx --template template.docx --output out.docx
|
||||
```
|
||||
|
||||
For complex template operations (multi-template merge, per-section headers/footers, style merging), write C# directly — see Critical Rules below for required patterns.
|
||||
|
||||
Run the **validation pipeline**, then the **hard gate-check**:
|
||||
```bash
|
||||
$CLI validate --input out.docx --gate-check assets/xsd/business-rules.xsd
|
||||
```
|
||||
Gate-check is a **hard requirement**. Do NOT deliver until it passes. If it fails: diagnose, fix, re-run.
|
||||
|
||||
Also diff to verify content preservation: `$CLI diff --before source.docx --after out.docx`
|
||||
|
||||
## Validation pipeline
|
||||
|
||||
Run after every write operation. For Scenario C the full pipeline is **mandatory**; for A/B it is **recommended** (skip only if the operation was trivially simple).
|
||||
|
||||
```bash
|
||||
$CLI merge-runs --input doc.docx # 1. consolidate runs
|
||||
$CLI validate --input doc.docx --xsd assets/xsd/wml-subset.xsd # 2. XSD structure
|
||||
$CLI validate --input doc.docx --business # 3. business rules
|
||||
```
|
||||
|
||||
If XSD fails, auto-repair and retry:
|
||||
```bash
|
||||
$CLI fix-order --input doc.docx
|
||||
$CLI validate --input doc.docx --xsd assets/xsd/wml-subset.xsd
|
||||
```
|
||||
|
||||
If XSD still fails, fall back to business rules + preview:
|
||||
```bash
|
||||
$CLI validate --input doc.docx --business
|
||||
scripts/docx_preview.sh doc.docx
|
||||
# Verify: font contamination=0, table count correct, drawing count correct, sectPr count correct
|
||||
```
|
||||
|
||||
Final preview: `scripts/docx_preview.sh doc.docx`
|
||||
|
||||
## Critical rules
|
||||
|
||||
These prevent file corruption — OpenXML is strict about element ordering.
|
||||
|
||||
**Element order** (properties always first):
|
||||
|
||||
| Parent | Order |
|
||||
|--------|-------|
|
||||
| `w:p` | `pPr` → runs |
|
||||
| `w:r` | `rPr` → `t`/`br`/`tab` |
|
||||
| `w:tbl` | `tblPr` → `tblGrid` → `tr` |
|
||||
| `w:tr` | `trPr` → `tc` |
|
||||
| `w:tc` | `tcPr` → `p` (min 1 `<w:p/>`) |
|
||||
| `w:body` | block content → `sectPr` (LAST child) |
|
||||
|
||||
**Direct format contamination:** When copying content from a source document, inline `rPr` (fonts, color) and `pPr` (borders, shading, spacing) override template styles. Always strip direct formatting — keep only `pStyle` reference and `t` text. Clean tables too (including `pPr/rPr` inside cells).
|
||||
|
||||
**Track changes:** `<w:del>` uses `<w:delText>`, never `<w:t>`. `<w:ins>` uses `<w:t>`, never `<w:delText>`.
|
||||
|
||||
**Font size:** `w:sz` = points × 2 (12pt → `sz="24"`). Margins/spacing in DXA (1 inch = 1440, 1cm ≈ 567).
|
||||
|
||||
**Heading styles MUST have OutlineLevel:** When defining heading styles (Heading1, ThesisH1, etc.), always include `new OutlineLevel { Val = N }` in `StyleParagraphProperties` (H1→0, H2→1, H3→2). Without this, Word sees them as plain styled text — TOC and navigation pane won't work.
|
||||
|
||||
**Multi-template merge:** When given multiple template files (font, heading, breaks), read `references/scenario_c_apply_template.md` section "Multi-Template Merge" FIRST. Key rules:
|
||||
- Merge styles from all templates into one styles.xml. Structure (sections/breaks) comes from the breaks template.
|
||||
- Each content paragraph must appear exactly ONCE — never duplicate when inserting section breaks.
|
||||
- NEVER insert empty/blank paragraphs as padding or section separators. Output paragraph count must equal input. Use section break properties (`w:sectPr` inside `w:pPr`) and style spacing (`w:spacing` before/after) for visual separation.
|
||||
- Insert oddPage section breaks before EVERY chapter heading, not just the first. Even if a chapter has dual-column content, it MUST start with oddPage; use a second continuous break after the heading for column switching.
|
||||
- Dual-column chapters need THREE section breaks: (1) oddPage in preceding para's pPr, (2) continuous+cols=2 in the chapter HEADING's pPr, (3) continuous+cols=1 in the last body para's pPr to revert.
|
||||
- Copy `titlePg` settings from the breaks template for EACH section. Abstract and TOC sections typically need `titlePg=true`.
|
||||
|
||||
**Multi-section headers/footers:** Templates with 10+ sections (e.g., Chinese thesis) have DIFFERENT headers/footers per section (Roman vs Arabic page numbers, different header text per zone). Rules:
|
||||
- Use C-2 Base-Replace: copy the TEMPLATE as output base, then replace body content. This preserves all sections, headers, footers, and titlePg settings automatically.
|
||||
- NEVER recreate headers/footers from scratch — copy template header/footer XML byte-for-byte.
|
||||
- NEVER add formatting (borders, alignment, font size) not present in the template header XML.
|
||||
- Non-cover sections MUST have header/footer XML files (at least empty header + page number footer).
|
||||
- See `references/scenario_c_apply_template.md` section "Multi-Section Header/Footer Transfer".
|
||||
|
||||
## References
|
||||
|
||||
Load as needed — don't load all at once. Pick the most relevant files for the task.
|
||||
|
||||
**The C# samples and design references below are the project's knowledge base ("encyclopedia").** When writing OpenXML code, ALWAYS read the relevant sample file first — it contains compilable, SDK-version-verified patterns that prevent common errors. When making aesthetic decisions, read the design principles and recipe files — they encode tested, harmonious parameter sets from authoritative sources (IEEE, ACM, APA, Nature, etc.), not guesses.
|
||||
|
||||
### Scenario guides (read first for each pipeline)
|
||||
|
||||
| File | When |
|
||||
|------|------|
|
||||
| `references/scenario_a_create.md` | Pipeline A: creating from scratch |
|
||||
| `references/scenario_b_edit_content.md` | Pipeline B: editing existing content |
|
||||
| `references/scenario_c_apply_template.md` | Pipeline C: applying template formatting |
|
||||
|
||||
### C# code samples (compilable, heavily commented — read when writing code)
|
||||
|
||||
| File | Topic |
|
||||
|------|-------|
|
||||
| `Samples/DocumentCreationSamples.cs` | Document lifecycle: create, open, save, streams, doc defaults, settings, properties, page setup, multi-section |
|
||||
| `Samples/StyleSystemSamples.cs` | Styles: Normal/Heading chain, character/table/list styles, DocDefaults, latentStyles, CJK 公文, APA 7th, import, resolve inheritance |
|
||||
| `Samples/CharacterFormattingSamples.cs` | RunProperties: fonts, size, bold/italic, all underlines, color, highlight, strike, sub/super, caps, spacing, shading, border, emphasis marks |
|
||||
| `Samples/ParagraphFormattingSamples.cs` | ParagraphProperties: justification, indentation, line/paragraph spacing, keep/widow, outline level, borders, tabs, numbering, bidi, frame |
|
||||
| `Samples/TableSamples.cs` | Tables: borders, grid, cell props, margins, row height, header repeat, merge (H+V), nested, floating, three-line 三线表, zebra striping |
|
||||
| `Samples/HeaderFooterSamples.cs` | Headers/footers: page numbers, "Page X of Y", first/even/odd, logo image, table layout, 公文 "-X-", per-section |
|
||||
| `Samples/ImageSamples.cs` | Images: inline, floating, text wrapping, border, alt text, in header/table, replace, SVG fallback, dimension calc |
|
||||
| `Samples/ListAndNumberingSamples.cs` | Numbering: bullets, multi-level decimal, custom symbols, outline→headings, legal, Chinese 一/(一)/1./(1), restart/continue |
|
||||
| `Samples/FieldAndTocSamples.cs` | Fields: TOC, SimpleField vs complex field, DATE/PAGE/REF/SEQ/MERGEFIELD/IF/STYLEREF, TOC styles |
|
||||
| `Samples/FootnoteAndCommentSamples.cs` | Footnotes, endnotes, comments (4-file system), bookmarks, hyperlinks (internal + external) |
|
||||
| `Samples/TrackChangesSamples.cs` | Revisions: insertions (w:t), deletions (w:delText!), formatting changes, accept/reject all, move tracking |
|
||||
| `Samples/AestheticRecipeSamples.cs` | 13 aesthetic recipes from authoritative sources: ModernCorporate, AcademicThesis, ExecutiveBrief, ChineseGovernment (GB/T 9704), MinimalModern, IEEE Conference, ACM sigconf, APA 7th, MLA 9th, Chicago/Turabian, Springer LNCS, Nature, HBR — each with exact values from official style guides |
|
||||
|
||||
Note: `Samples/` path is relative to `scripts/dotnet/MiniMaxAIDocx.Core/`.
|
||||
|
||||
### Markdown references (read when you need specifications or design rules)
|
||||
|
||||
| File | When |
|
||||
|------|------|
|
||||
| `references/openxml_element_order.md` | XML element ordering rules (prevents corruption) |
|
||||
| `references/openxml_units.md` | Unit conversion: DXA, EMU, half-points, eighth-points |
|
||||
| `references/openxml_encyclopedia_part1.md` | Detailed C# encyclopedia: document creation, styles, character & paragraph formatting |
|
||||
| `references/openxml_encyclopedia_part2.md` | Detailed C# encyclopedia: page setup, tables, headers/footers, sections, doc properties |
|
||||
| `references/openxml_encyclopedia_part3.md` | Detailed C# encyclopedia: TOC, footnotes, fields, track changes, comments, images, math, numbering, protection |
|
||||
| `references/typography_guide.md` | Font pairing, sizes, spacing, page layout, table design, color schemes |
|
||||
| `references/cjk_typography.md` | CJK fonts, 字号 sizes, RunFonts mapping, GB/T 9704 公文 standard |
|
||||
| `references/cjk_university_template_guide.md` | Chinese university thesis templates: numeric styleIds (1/2/3 vs Heading1), document zone structure (cover→abstract→TOC→body→references), font expectations, common mistakes |
|
||||
| `references/design_principles.md` | **Aesthetic foundations**: 6 design principles (white space, contrast/scale, proximity, alignment, repetition, hierarchy) — teaches WHY, not just WHAT |
|
||||
| `references/design_good_bad_examples.md` | **Good vs Bad comparisons**: 10 categories of typography mistakes with OpenXML values, ASCII mockups, and fixes |
|
||||
| `references/track_changes_guide.md` | Revision marks deep dive |
|
||||
| `references/troubleshooting.md` | **Symptom-driven fixes**: 13 common problems indexed by what you SEE (headings wrong, images missing, TOC broken, etc.) — search by symptom, find the fix |
|
||||
@@ -0,0 +1,250 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">
|
||||
|
||||
<!-- Document Defaults -->
|
||||
<w:docDefaults>
|
||||
<w:rPrDefault>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman" w:eastAsia="SimSun" w:cs="Times New Roman" />
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN" w:bidi="ar-SA" />
|
||||
</w:rPr>
|
||||
</w:rPrDefault>
|
||||
<w:pPrDefault>
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" w:line="480" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
</w:pPrDefault>
|
||||
</w:docDefaults>
|
||||
|
||||
<w:latentStyles w:defLockedState="0" w:defUIPriority="99" w:defSemiHidden="0" w:defUnhideWhenUsed="0" w:defQFormat="0" w:count="376" />
|
||||
|
||||
<!-- Normal — Times New Roman 12pt, double spaced, first line indent -->
|
||||
<w:style w:type="paragraph" w:default="1" w:styleId="Normal">
|
||||
<w:name w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="720" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman" />
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Default Paragraph Font -->
|
||||
<w:style w:type="character" w:default="1" w:styleId="DefaultParagraphFont">
|
||||
<w:name w:val="Default Paragraph Font" />
|
||||
<w:uiPriority w:val="1" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 1 — Bold, 14pt, centered, no color, no first-line indent -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading1">
|
||||
<w:name w:val="heading 1" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="480" w:after="240" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
<w:jc w:val="center" />
|
||||
<w:outlineLvl w:val="0" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
<w:sz w:val="28" />
|
||||
<w:szCs w:val="28" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 2 — Bold, 13pt -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading2">
|
||||
<w:name w:val="heading 2" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="360" w:after="120" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
<w:outlineLvl w:val="1" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
<w:sz w:val="26" />
|
||||
<w:szCs w:val="26" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 3 — Bold, 12pt -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading3">
|
||||
<w:name w:val="heading 3" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="240" w:after="80" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
<w:outlineLvl w:val="2" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Title — Centered, bold, 14pt (academic title page) -->
|
||||
<w:style w:type="paragraph" w:styleId="Title">
|
||||
<w:name w:val="Title" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="10" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="480" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
<w:sz w:val="28" />
|
||||
<w:szCs w:val="28" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Subtitle -->
|
||||
<w:style w:type="paragraph" w:styleId="Subtitle">
|
||||
<w:name w:val="Subtitle" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="11" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="240" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Quote — Block quote, indented 0.5 inch on both sides -->
|
||||
<w:style w:type="paragraph" w:styleId="Quote">
|
||||
<w:name w:val="Quote" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="29" />
|
||||
<w:pPr>
|
||||
<w:spacing w:before="240" w:after="240" w:line="480" w:lineRule="auto" />
|
||||
<w:ind w:left="720" w:right="720" w:firstLine="0" />
|
||||
</w:pPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Table Normal -->
|
||||
<w:style w:type="table" w:default="1" w:styleId="TableNormal">
|
||||
<w:name w:val="Normal Table" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:tblPr>
|
||||
<w:tblInd w:w="0" w:type="dxa" />
|
||||
<w:tblCellMar>
|
||||
<w:top w:w="0" w:type="dxa" />
|
||||
<w:left w:w="108" w:type="dxa" />
|
||||
<w:bottom w:w="0" w:type="dxa" />
|
||||
<w:right w:w="108" w:type="dxa" />
|
||||
</w:tblCellMar>
|
||||
</w:tblPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Table Grid — Simple borders, no color -->
|
||||
<w:style w:type="table" w:styleId="TableGrid">
|
||||
<w:name w:val="Table Grid" />
|
||||
<w:basedOn w:val="TableNormal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:tblPr>
|
||||
<w:tblBorders>
|
||||
<w:top w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:left w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:bottom w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:right w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:insideH w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:insideV w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
</w:tblBorders>
|
||||
</w:tblPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Header -->
|
||||
<w:style w:type="paragraph" w:styleId="Header">
|
||||
<w:name w:val="header" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="center" w:pos="4680" />
|
||||
<w:tab w:val="right" w:pos="9360" />
|
||||
</w:tabs>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Footer -->
|
||||
<w:style w:type="paragraph" w:styleId="Footer">
|
||||
<w:name w:val="footer" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="center" w:pos="4680" />
|
||||
<w:tab w:val="right" w:pos="9360" />
|
||||
</w:tabs>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
|
||||
<w:ind w:firstLine="0" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Hyperlink -->
|
||||
<w:style w:type="character" w:styleId="Hyperlink">
|
||||
<w:name w:val="Hyperlink" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:unhideWhenUsed />
|
||||
<w:rPr>
|
||||
<w:color w:val="0563C1" />
|
||||
<w:u w:val="single" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
</w:styles>
|
||||
@@ -0,0 +1,284 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">
|
||||
|
||||
<!-- Document Defaults -->
|
||||
<w:docDefaults>
|
||||
<w:rPrDefault>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:eastAsia="Microsoft YaHei" w:cs="Arial" />
|
||||
<w:color w:val="333333" />
|
||||
<w:sz w:val="22" />
|
||||
<w:szCs w:val="22" />
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN" w:bidi="ar-SA" />
|
||||
</w:rPr>
|
||||
</w:rPrDefault>
|
||||
<w:pPrDefault>
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="259" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
</w:pPrDefault>
|
||||
</w:docDefaults>
|
||||
|
||||
<w:latentStyles w:defLockedState="0" w:defUIPriority="99" w:defSemiHidden="0" w:defUnhideWhenUsed="0" w:defQFormat="0" w:count="376" />
|
||||
|
||||
<!-- Normal -->
|
||||
<w:style w:type="paragraph" w:default="1" w:styleId="Normal">
|
||||
<w:name w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" />
|
||||
<w:color w:val="333333" />
|
||||
<w:sz w:val="22" />
|
||||
<w:szCs w:val="22" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Default Paragraph Font -->
|
||||
<w:style w:type="character" w:default="1" w:styleId="DefaultParagraphFont">
|
||||
<w:name w:val="Default Paragraph Font" />
|
||||
<w:uiPriority w:val="1" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 1 — Dark Blue -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading1">
|
||||
<w:name w:val="heading 1" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="480" w:after="240" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="0" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="1F3864" />
|
||||
<w:sz w:val="56" />
|
||||
<w:szCs w:val="56" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 2 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading2">
|
||||
<w:name w:val="heading 2" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="360" w:after="120" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="1" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="1F3864" />
|
||||
<w:sz w:val="48" />
|
||||
<w:szCs w:val="48" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 3 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading3">
|
||||
<w:name w:val="heading 3" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="240" w:after="80" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="2" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="1F3864" />
|
||||
<w:sz w:val="36" />
|
||||
<w:szCs w:val="36" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 4 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading4">
|
||||
<w:name w:val="heading 4" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="160" w:after="80" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="3" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:i />
|
||||
<w:color w:val="1F3864" />
|
||||
<w:sz w:val="28" />
|
||||
<w:szCs w:val="28" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Title -->
|
||||
<w:style w:type="paragraph" w:styleId="Title">
|
||||
<w:name w:val="Title" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="10" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="240" w:line="240" w:lineRule="auto" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:color w:val="1F3864" />
|
||||
<w:sz w:val="72" />
|
||||
<w:szCs w:val="72" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Subtitle -->
|
||||
<w:style w:type="paragraph" w:styleId="Subtitle">
|
||||
<w:name w:val="Subtitle" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="11" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="360" w:line="240" w:lineRule="auto" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:i />
|
||||
<w:color w:val="595959" />
|
||||
<w:sz w:val="32" />
|
||||
<w:szCs w:val="32" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Table Normal — default table style (the corporate Table Grid with blue header row follows) -->
|
||||
<w:style w:type="table" w:default="1" w:styleId="TableNormal">
|
||||
<w:name w:val="Normal Table" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:tblPr>
|
||||
<w:tblInd w:w="0" w:type="dxa" />
|
||||
<w:tblCellMar>
|
||||
<w:top w:w="0" w:type="dxa" />
|
||||
<w:left w:w="108" w:type="dxa" />
|
||||
<w:bottom w:w="0" w:type="dxa" />
|
||||
<w:right w:w="108" w:type="dxa" />
|
||||
</w:tblCellMar>
|
||||
</w:tblPr>
|
||||
</w:style>
|
||||
|
||||
<w:style w:type="table" w:styleId="TableGrid">
|
||||
<w:name w:val="Table Grid" />
|
||||
<w:basedOn w:val="TableNormal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:tblPr>
|
||||
<w:tblBorders>
|
||||
<w:top w:val="single" w:sz="4" w:space="0" w:color="BFBFBF" />
|
||||
<w:left w:val="single" w:sz="4" w:space="0" w:color="BFBFBF" />
|
||||
<w:bottom w:val="single" w:sz="4" w:space="0" w:color="BFBFBF" />
|
||||
<w:right w:val="single" w:sz="4" w:space="0" w:color="BFBFBF" />
|
||||
<w:insideH w:val="single" w:sz="4" w:space="0" w:color="BFBFBF" />
|
||||
<w:insideV w:val="single" w:sz="4" w:space="0" w:color="BFBFBF" />
|
||||
</w:tblBorders>
|
||||
</w:tblPr>
|
||||
<w:tblStylePr w:type="firstRow">
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
<w:color w:val="FFFFFF" />
|
||||
</w:rPr>
|
||||
<w:tcPr>
|
||||
<w:shd w:val="clear" w:color="auto" w:fill="2F5496" />
|
||||
<w:tcBorders>
|
||||
<w:top w:val="single" w:sz="4" w:space="0" w:color="2F5496" />
|
||||
<w:left w:val="single" w:sz="4" w:space="0" w:color="2F5496" />
|
||||
<w:bottom w:val="single" w:sz="4" w:space="0" w:color="2F5496" />
|
||||
<w:right w:val="single" w:sz="4" w:space="0" w:color="2F5496" />
|
||||
<w:insideH w:val="single" w:sz="4" w:space="0" w:color="3A6BC5" />
|
||||
<w:insideV w:val="single" w:sz="4" w:space="0" w:color="3A6BC5" />
|
||||
</w:tcBorders>
|
||||
</w:tcPr>
|
||||
</w:tblStylePr>
|
||||
<w:tblStylePr w:type="band1Horz">
|
||||
<w:tcPr>
|
||||
<w:shd w:val="clear" w:color="auto" w:fill="D9E2F3" />
|
||||
</w:tcPr>
|
||||
</w:tblStylePr>
|
||||
</w:style>
|
||||
|
||||
<!-- Header -->
|
||||
<w:style w:type="paragraph" w:styleId="Header">
|
||||
<w:name w:val="header" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="center" w:pos="4680" />
|
||||
<w:tab w:val="right" w:pos="9360" />
|
||||
</w:tabs>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="18" />
|
||||
<w:szCs w:val="18" />
|
||||
<w:color w:val="808080" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Footer -->
|
||||
<w:style w:type="paragraph" w:styleId="Footer">
|
||||
<w:name w:val="footer" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="center" w:pos="4680" />
|
||||
<w:tab w:val="right" w:pos="9360" />
|
||||
</w:tabs>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="18" />
|
||||
<w:szCs w:val="18" />
|
||||
<w:color w:val="808080" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Hyperlink -->
|
||||
<w:style w:type="character" w:styleId="Hyperlink">
|
||||
<w:name w:val="Hyperlink" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:unhideWhenUsed />
|
||||
<w:rPr>
|
||||
<w:color w:val="0563C1" />
|
||||
<w:u w:val="single" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
</w:styles>
|
||||
@@ -0,0 +1,449 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006">
|
||||
|
||||
<!-- Document Defaults -->
|
||||
<w:docDefaults>
|
||||
<w:rPrDefault>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:eastAsia="SimSun" w:cs="Arial" />
|
||||
<w:sz w:val="22" />
|
||||
<w:szCs w:val="22" />
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN" w:bidi="ar-SA" />
|
||||
</w:rPr>
|
||||
</w:rPrDefault>
|
||||
<w:pPrDefault>
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="259" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
</w:pPrDefault>
|
||||
</w:docDefaults>
|
||||
|
||||
<!-- Latent Styles -->
|
||||
<w:latentStyles w:defLockedState="0" w:defUIPriority="99" w:defSemiHidden="0" w:defUnhideWhenUsed="0" w:defQFormat="0" w:count="376" />
|
||||
|
||||
<!-- Normal (Default Paragraph Style) -->
|
||||
<w:style w:type="paragraph" w:default="1" w:styleId="Normal">
|
||||
<w:name w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" />
|
||||
<w:sz w:val="22" />
|
||||
<w:szCs w:val="22" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Default Paragraph Font -->
|
||||
<w:style w:type="character" w:default="1" w:styleId="DefaultParagraphFont">
|
||||
<w:name w:val="Default Paragraph Font" />
|
||||
<w:uiPriority w:val="1" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 1 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading1">
|
||||
<w:name w:val="heading 1" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="480" w:after="240" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="0" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="56" />
|
||||
<w:szCs w:val="56" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 2 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading2">
|
||||
<w:name w:val="heading 2" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="360" w:after="120" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="1" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="48" />
|
||||
<w:szCs w:val="48" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 3 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading3">
|
||||
<w:name w:val="heading 3" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="240" w:after="80" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="2" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="36" />
|
||||
<w:szCs w:val="36" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 4 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading4">
|
||||
<w:name w:val="heading 4" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="160" w:after="80" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="3" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:i />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="28" />
|
||||
<w:szCs w:val="28" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 5 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading5">
|
||||
<w:name w:val="heading 5" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="160" w:after="80" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="4" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="24" />
|
||||
<w:szCs w:val="24" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Heading 6 -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading6">
|
||||
<w:name w:val="heading 6" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="9" />
|
||||
<w:pPr>
|
||||
<w:keepNext />
|
||||
<w:keepLines />
|
||||
<w:spacing w:before="160" w:after="80" w:line="240" w:lineRule="auto" />
|
||||
<w:outlineLvl w:val="5" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:b />
|
||||
<w:i />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="22" />
|
||||
<w:szCs w:val="22" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Title -->
|
||||
<w:style w:type="paragraph" w:styleId="Title">
|
||||
<w:name w:val="Title" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="10" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="240" w:line="240" w:lineRule="auto" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light" w:hAnsi="Calibri Light" />
|
||||
<w:color w:val="2F5496" />
|
||||
<w:sz w:val="72" />
|
||||
<w:szCs w:val="72" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Subtitle -->
|
||||
<w:style w:type="paragraph" w:styleId="Subtitle">
|
||||
<w:name w:val="Subtitle" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="11" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="360" w:line="240" w:lineRule="auto" />
|
||||
<w:jc w:val="center" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:i />
|
||||
<w:color w:val="595959" />
|
||||
<w:sz w:val="32" />
|
||||
<w:szCs w:val="32" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Quote -->
|
||||
<w:style w:type="paragraph" w:styleId="Quote">
|
||||
<w:name w:val="Quote" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="29" />
|
||||
<w:pPr>
|
||||
<w:spacing w:before="240" w:after="240" />
|
||||
<w:ind w:left="720" w:right="720" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:i />
|
||||
<w:color w:val="404040" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Intense Quote -->
|
||||
<w:style w:type="paragraph" w:styleId="IntenseQuote">
|
||||
<w:name w:val="Intense Quote" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:qFormat />
|
||||
<w:uiPriority w:val="30" />
|
||||
<w:pPr>
|
||||
<w:spacing w:before="240" w:after="240" />
|
||||
<w:ind w:left="720" w:right="720" />
|
||||
<w:pBdr>
|
||||
<w:left w:val="single" w:sz="18" w:space="12" w:color="2F5496" />
|
||||
</w:pBdr>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
<w:i />
|
||||
<w:color w:val="2F5496" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- TOC Heading -->
|
||||
<w:style w:type="paragraph" w:styleId="TOCHeading">
|
||||
<w:name w:val="TOC Heading" />
|
||||
<w:basedOn w:val="Heading1" />
|
||||
<w:next w:val="Normal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:outlineLvl w:val="9" />
|
||||
</w:pPr>
|
||||
</w:style>
|
||||
|
||||
<!-- TOC 1 -->
|
||||
<w:style w:type="paragraph" w:styleId="TOC1">
|
||||
<w:name w:val="toc 1" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:spacing w:before="120" w:after="0" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- TOC 2 -->
|
||||
<w:style w:type="paragraph" w:styleId="TOC2">
|
||||
<w:name w:val="toc 2" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" />
|
||||
<w:ind w:left="240" />
|
||||
</w:pPr>
|
||||
</w:style>
|
||||
|
||||
<!-- TOC 3 -->
|
||||
<w:style w:type="paragraph" w:styleId="TOC3">
|
||||
<w:name w:val="toc 3" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" />
|
||||
<w:ind w:left="480" />
|
||||
</w:pPr>
|
||||
</w:style>
|
||||
|
||||
<!-- List Bullet -->
|
||||
<w:style w:type="paragraph" w:styleId="ListBullet">
|
||||
<w:name w:val="List Bullet" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="36" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" />
|
||||
<w:ind w:left="720" w:hanging="360" />
|
||||
<w:contextualSpacing />
|
||||
</w:pPr>
|
||||
</w:style>
|
||||
|
||||
<!-- List Number -->
|
||||
<w:style w:type="paragraph" w:styleId="ListNumber">
|
||||
<w:name w:val="List Number" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="36" />
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" />
|
||||
<w:ind w:left="720" w:hanging="360" />
|
||||
<w:contextualSpacing />
|
||||
</w:pPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Table Normal -->
|
||||
<w:style w:type="table" w:default="1" w:styleId="TableNormal">
|
||||
<w:name w:val="Normal Table" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:tblPr>
|
||||
<w:tblInd w:w="0" w:type="dxa" />
|
||||
<w:tblCellMar>
|
||||
<w:top w:w="0" w:type="dxa" />
|
||||
<w:left w:w="108" w:type="dxa" />
|
||||
<w:bottom w:w="0" w:type="dxa" />
|
||||
<w:right w:w="108" w:type="dxa" />
|
||||
</w:tblCellMar>
|
||||
</w:tblPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Table Grid -->
|
||||
<w:style w:type="table" w:styleId="TableGrid">
|
||||
<w:name w:val="Table Grid" />
|
||||
<w:basedOn w:val="TableNormal" />
|
||||
<w:uiPriority w:val="39" />
|
||||
<w:tblPr>
|
||||
<w:tblBorders>
|
||||
<w:top w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:left w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:bottom w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:right w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:insideH w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
<w:insideV w:val="single" w:sz="4" w:space="0" w:color="auto" />
|
||||
</w:tblBorders>
|
||||
</w:tblPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Header -->
|
||||
<w:style w:type="paragraph" w:styleId="Header">
|
||||
<w:name w:val="header" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="center" w:pos="4680" />
|
||||
<w:tab w:val="right" w:pos="9360" />
|
||||
</w:tabs>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="18" />
|
||||
<w:szCs w:val="18" />
|
||||
<w:color w:val="808080" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Footer -->
|
||||
<w:style w:type="paragraph" w:styleId="Footer">
|
||||
<w:name w:val="footer" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="center" w:pos="4680" />
|
||||
<w:tab w:val="right" w:pos="9360" />
|
||||
</w:tabs>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="18" />
|
||||
<w:szCs w:val="18" />
|
||||
<w:color w:val="808080" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Hyperlink -->
|
||||
<w:style w:type="character" w:styleId="Hyperlink">
|
||||
<w:name w:val="Hyperlink" />
|
||||
<w:uiPriority w:val="99" />
|
||||
<w:unhideWhenUsed />
|
||||
<w:rPr>
|
||||
<w:color w:val="0563C1" />
|
||||
<w:u w:val="single" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Comment Text -->
|
||||
<w:style w:type="paragraph" w:styleId="CommentText">
|
||||
<w:name w:val="annotation text" />
|
||||
<w:basedOn w:val="Normal" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:pPr>
|
||||
<w:spacing w:line="240" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:sz w:val="20" />
|
||||
<w:szCs w:val="20" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Comment Reference -->
|
||||
<w:style w:type="character" w:styleId="CommentReference">
|
||||
<w:name w:val="annotation reference" />
|
||||
<w:semiHidden />
|
||||
<w:unhideWhenUsed />
|
||||
<w:rPr>
|
||||
<w:sz w:val="16" />
|
||||
<w:szCs w:val="16" />
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
</w:styles>
|
||||
@@ -0,0 +1,470 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- ============================================================================ -->
|
||||
<!-- Aesthetic Rules Schema for minimax-docx -->
|
||||
<!-- ============================================================================ -->
|
||||
<!-- Purpose: Validates whether a document follows basic aesthetic rules that -->
|
||||
<!-- produce visually harmonious results. This is a "taste checker" that flags -->
|
||||
<!-- common ugly patterns. -->
|
||||
<!-- -->
|
||||
<!-- IMPORTANT: XSD validates STRUCTURE and VALUE RANGES, not SEMANTICS. -->
|
||||
<!-- Many aesthetic rules require cross-element comparison (e.g., "H1 must be -->
|
||||
<!-- larger than H2") which XSD cannot express. These rules are documented in -->
|
||||
<!-- comments and must be enforced by a programmatic validator. -->
|
||||
<!-- -->
|
||||
<!-- Rules that CAN be expressed in XSD: -->
|
||||
<!-- - Font size ranges (body 10-14pt, headings 12-26pt) -->
|
||||
<!-- - Line spacing ranges (1.0x to 2.33x) -->
|
||||
<!-- - Margin minimums (at least 0.5in left/right, 0.25in top/bottom) -->
|
||||
<!-- - Table cell padding minimums -->
|
||||
<!-- -->
|
||||
<!-- Rules that CANNOT be expressed in XSD (enforce programmatically): -->
|
||||
<!-- - H1 sz >= H2 sz >= H3 sz >= body sz (hierarchy) -->
|
||||
<!-- - Maximum 3 font families across all styles -->
|
||||
<!-- - Heading space-before >= space-after -->
|
||||
<!-- - Color contrast ratio between text and background -->
|
||||
<!-- - Consistent font family within heading vs body groups -->
|
||||
<!-- - Line spacing and font size harmony (larger text needs tighter spacing) -->
|
||||
<!-- -->
|
||||
<!-- MIT License - minimax-docx project -->
|
||||
<!-- ============================================================================ -->
|
||||
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
targetNamespace="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
elementFormDefault="qualified">
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 1: Body Font Size Range -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Body text must be 10-14pt (half-points: 20-28). -->
|
||||
<!-- WHY: Below 10pt is hard to read for most adults. -->
|
||||
<!-- Above 14pt body text looks childish or wasteful. -->
|
||||
<!-- The sweet spot is 10.5-12pt for most font families. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticBodyFontSize">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Body text font size in half-points.
|
||||
Acceptable range: 20-28 (10pt-14pt).
|
||||
- 10pt (20): minimum for comfortable reading
|
||||
- 11pt (22): modern default (Calibri, Aptos)
|
||||
- 12pt (24): traditional default (Times New Roman)
|
||||
- 14pt (28): maximum before body text looks oversized
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:positiveInteger">
|
||||
<xs:minInclusive value="20"/> <!-- 10pt minimum -->
|
||||
<xs:maxInclusive value="28"/> <!-- 14pt maximum -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 2: Heading Font Size Range -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Headings must be 12-26pt (half-points: 24-52). -->
|
||||
<!-- WHY: Below 12pt, a heading cannot be visually distinguished -->
|
||||
<!-- from body text by size alone. -->
|
||||
<!-- Above 26pt is poster-sized and wastes vertical space. -->
|
||||
<!-- NOTE: Some academic styles use 12pt headings (same as body) -->
|
||||
<!-- and differentiate via bold/italic/centering instead. -->
|
||||
<!-- The lower bound of 24 (12pt) accommodates this. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticHeadingFontSize">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Heading font size in half-points.
|
||||
Acceptable range: 24-52 (12pt-26pt).
|
||||
- 12pt (24): APA-style (hierarchy via bold/italic, not size)
|
||||
- 16pt (32): typical H2/H3
|
||||
- 20pt (40): typical H1
|
||||
- 26pt (52): maximum before headings dominate the page
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:positiveInteger">
|
||||
<xs:minInclusive value="24"/> <!-- 12pt minimum -->
|
||||
<xs:maxInclusive value="52"/> <!-- 26pt maximum -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 3: Line Spacing Range -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Line spacing (in auto mode) must be 240-560 (1.0x-2.33x). -->
|
||||
<!-- WHY: Below 1.0x, ascenders/descenders overlap — unreadable. -->
|
||||
<!-- Above 2.33x, lines appear disconnected. -->
|
||||
<!-- Sweet spots: 1.15x (276) for sans, 1.5x (360) for -->
|
||||
<!-- generous layouts, 2.0x (480) for academic. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticLineSpacing">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Line spacing value for auto line-spacing rule.
|
||||
In 240ths of single spacing: 240 = 1.0x, 480 = 2.0x.
|
||||
Acceptable range: 240-560 (1.0x to 2.33x).
|
||||
Common values:
|
||||
- 240: single spacing (dense, technical)
|
||||
- 259: Word's 1.08x default
|
||||
- 276: 1.15x (modern corporate default)
|
||||
- 336: 1.4x (executive/generous)
|
||||
- 360: 1.5x (generous/minimal)
|
||||
- 480: 2.0x (academic double spacing)
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:positiveInteger">
|
||||
<xs:minInclusive value="240"/> <!-- 1.0x single spacing -->
|
||||
<xs:maxInclusive value="560"/> <!-- ~2.33x — beyond double feels disconnected -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 3b: Fixed Line Spacing Range -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- For lineRule="exact", line value is in DXA (twentieths of pt) -->
|
||||
<!-- Range: 200-720 DXA (10pt-36pt fixed line height) -->
|
||||
<!-- Chinese government standard uses 560 DXA (28pt). -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticFixedLineSpacing">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Fixed line spacing value (lineRule="exact") in DXA.
|
||||
Acceptable range: 200-720 (10pt-36pt).
|
||||
- 560: Chinese government standard (28pt, for 16pt body)
|
||||
- 480: double-space equivalent for 12pt body
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:positiveInteger">
|
||||
<xs:minInclusive value="200"/> <!-- 10pt minimum fixed height -->
|
||||
<xs:maxInclusive value="720"/> <!-- 36pt maximum fixed height -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 4: Margin Minimums -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Left/right margins must be at least 720 DXA (0.5 inch). -->
|
||||
<!-- WHY: Below 0.5in, most printers clip content. -->
|
||||
<!-- Also, narrow margins create a cramped, unprofessional -->
|
||||
<!-- appearance. Even "full bleed" designs need internal -->
|
||||
<!-- text margins. -->
|
||||
<!-- Max set to 4320 DXA (3 inches) to prevent absurd margins. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticMargin">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Page margin in DXA. Minimum 720 (0.5 inch), maximum 4320 (3 inches).
|
||||
Common values:
|
||||
- 720: 0.5in (minimum printable)
|
||||
- 1440: 1.0in (standard US)
|
||||
- 1588: 28mm (Chinese government left margin)
|
||||
- 1800: 1.25in (executive/premium)
|
||||
- 2160: 1.5in (binding margin or narrow-column design)
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:positiveInteger">
|
||||
<xs:minInclusive value="720"/> <!-- 0.5in — minimum for print safety -->
|
||||
<xs:maxInclusive value="4320"/> <!-- 3in — beyond this is absurd -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- Top/bottom margins: signed because negative values can create -->
|
||||
<!-- overlap effects, but we still enforce a reasonable minimum. -->
|
||||
<xs:simpleType name="ST_AestheticVerticalMargin">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Vertical (top/bottom) page margin in DXA.
|
||||
Range: 360 to 4320 (0.25in to 3in).
|
||||
Slightly more permissive than horizontal margins because
|
||||
header/footer areas may reduce effective vertical margin.
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:integer">
|
||||
<xs:minInclusive value="360"/> <!-- 0.25in — tighter vertical is sometimes acceptable -->
|
||||
<xs:maxInclusive value="4320"/>
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 5: Paragraph Spacing Ranges -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Space before/after paragraphs should be 0-960 DXA (0-48pt). -->
|
||||
<!-- WHY: More than 48pt of space before/after creates awkward -->
|
||||
<!-- gaps that disrupt reading flow. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticParaSpacing">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Paragraph spacing (before/after) in DXA.
|
||||
Range: 0-960 (0pt-48pt).
|
||||
Common values:
|
||||
- 0: academic style (uses first-line indent instead)
|
||||
- 80: 4pt (tight, used after H2/H3)
|
||||
- 120: 6pt (moderate)
|
||||
- 160: 8pt (standard modern spacing)
|
||||
- 200: 10pt (generous/executive)
|
||||
- 240: 12pt (very generous/minimal)
|
||||
- 480: 24pt (heading before — creates section break)
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:nonNegativeInteger">
|
||||
<xs:minInclusive value="0"/>
|
||||
<xs:maxInclusive value="960"/> <!-- 48pt max — beyond this is a page break, not spacing -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 6: Table Cell Padding Minimum -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Table cells need at least 28 DXA (~1.4pt) padding. -->
|
||||
<!-- WHY: Without padding, text touches cell borders — visually -->
|
||||
<!-- cramped and hard to read. Even borderless tables need -->
|
||||
<!-- padding for column separation. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticCellPadding">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Table cell padding in DXA. Minimum 28 DXA (~1.4pt).
|
||||
Recommended: 57 DXA (~2.85pt) for comfortable spacing.
|
||||
Maximum: 288 DXA (~14pt) — beyond this wastes space.
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:nonNegativeInteger">
|
||||
<xs:minInclusive value="28"/> <!-- ~1.4pt minimum breathing room -->
|
||||
<xs:maxInclusive value="288"/> <!-- ~14pt — more than this is excessive -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 7: Border Size Range -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Border size (in eighth-points) should be 2-24 (0.25pt-3pt). -->
|
||||
<!-- WHY: Below 0.25pt borders may not render or print. -->
|
||||
<!-- Above 3pt borders look heavy and distracting. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticBorderSize">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Border width in eighth-points.
|
||||
Range: 2-24 (0.25pt to 3pt).
|
||||
Common values:
|
||||
- 4: 0.5pt (thin, standard)
|
||||
- 6: 0.75pt (header separator in three-line tables)
|
||||
- 8: 1.0pt (medium, good for framing borders)
|
||||
- 12: 1.5pt (heavy, used for top/bottom in three-line tables)
|
||||
- 24: 3.0pt (maximum before borders dominate)
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:positiveInteger">
|
||||
<xs:minInclusive value="2"/> <!-- 0.25pt minimum visible -->
|
||||
<xs:maxInclusive value="24"/> <!-- 3pt maximum tasteful -->
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 8: Color Value Format -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Colors must be valid 6-digit hex (RRGGBB) or "auto". -->
|
||||
<!-- This is structural validation, not aesthetic validation. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticColor">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Color value: 6-digit hex (RRGGBB) or "auto".
|
||||
Examples: "000000", "1F3864", "2C3E50", "auto".
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:string">
|
||||
<xs:pattern value="[0-9A-Fa-f]{6}|auto"/>
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- RULE 9: First-Line Indent Range -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- First-line indent is 0 (none) or, if used, ideally 360-1440 DXA -->
|
||||
<!-- (0.25in - 1.0in). -->
|
||||
<!-- WHY: Below 0.25in the indent is barely visible. -->
|
||||
<!-- Above 1.0in the indent looks like a tab error. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_AestheticFirstLineIndent">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
First-line indent in DXA. Range: 0-1440 (0in to 1.0in).
|
||||
- 0: no indent (modern style with space-after)
|
||||
- 480: 0.33in (compact)
|
||||
- 640: ~0.44in (2 Chinese characters at 16pt)
|
||||
- 720: 0.5in (standard APA/academic)
|
||||
- 1440: 1.0in (maximum before it looks wrong)
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:restriction base="xs:nonNegativeInteger">
|
||||
<xs:minInclusive value="0"/>
|
||||
<xs:maxInclusive value="1440"/>
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- COMPOSITE TYPE: Aesthetic Run Properties Check -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Validates run-level properties for aesthetic compliance. -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_AestheticRPr">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Aesthetic run properties validator.
|
||||
Checks font size (against the body range, 20-28 half-points) and color format at the run level.
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:all>
|
||||
<xs:element name="sz" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="val" type="w:ST_AestheticBodyFontSize" use="required"/>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
<xs:element name="szCs" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="val" type="w:ST_AestheticBodyFontSize" use="required"/>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
<xs:element name="color" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="val" type="w:ST_AestheticColor" use="required"/>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
</xs:all>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- COMPOSITE TYPE: Aesthetic Spacing Check -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_AestheticSpacing">
|
||||
<xs:annotation>
|
||||
<xs:documentation>
|
||||
Aesthetic spacing validator for paragraph spacing properties.
|
||||
Validates line spacing and before/after spacing are in range.
|
||||
</xs:documentation>
|
||||
</xs:annotation>
|
||||
<xs:attribute name="line" type="w:ST_AestheticLineSpacing" use="optional"/>
|
||||
<xs:attribute name="before" type="w:ST_AestheticParaSpacing" use="optional"/>
|
||||
<xs:attribute name="after" type="w:ST_AestheticParaSpacing" use="optional"/>
|
||||
<xs:attribute name="lineRule" use="optional">
|
||||
<xs:simpleType>
|
||||
<xs:restriction base="xs:string">
|
||||
<xs:enumeration value="auto"/>
|
||||
<xs:enumeration value="exact"/>
|
||||
<xs:enumeration value="atLeast"/>
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
</xs:attribute>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- COMPOSITE TYPE: Aesthetic Page Margins Check -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_AestheticPageMargins">
  <xs:annotation>
    <xs:documentation>
      Aesthetic page margin validator.
      Ensures all margins meet minimum print-safe thresholds.
    </xs:documentation>
  </xs:annotation>
  <!-- The four page edges are mandatory; their limits are defined by the
       referenced w:ST_Aesthetic*Margin types (declared outside this excerpt). -->
  <xs:attribute name="left" use="required" type="w:ST_AestheticMargin"/>
  <xs:attribute name="right" use="required" type="w:ST_AestheticMargin"/>
  <xs:attribute name="top" use="required" type="w:ST_AestheticVerticalMargin"/>
  <xs:attribute name="bottom" use="required" type="w:ST_AestheticVerticalMargin"/>
  <!-- Optional extras: any non-negative integer is accepted. -->
  <xs:attribute name="header" type="xs:nonNegativeInteger"/>
  <xs:attribute name="footer" type="xs:nonNegativeInteger"/>
  <xs:attribute name="gutter" type="xs:nonNegativeInteger"/>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- COMPOSITE TYPE: Aesthetic Table Cell Margin Check -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_AestheticTableCellMargin">
  <xs:annotation>
    <xs:documentation>
      Aesthetic table cell margin validator.
      Ensures minimum padding for readability.
    </xs:documentation>
  </xs:annotation>
  <!-- Both attributes are mandatory: "w" carries the padding value and
       "type" names its unit/kind from the closed list below. -->
  <xs:attribute name="type" use="required">
    <xs:simpleType>
      <xs:restriction base="xs:string">
        <xs:enumeration value="auto"/>
        <xs:enumeration value="dxa"/>
        <xs:enumeration value="nil"/>
        <xs:enumeration value="pct"/>
      </xs:restriction>
    </xs:simpleType>
  </xs:attribute>
  <xs:attribute name="w" use="required" type="w:ST_AestheticCellPadding"/>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- PROGRAMMATIC RULES (cannot be expressed in XSD) -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- -->
|
||||
<!-- The following rules must be checked by a programmatic -->
|
||||
<!-- validator (e.g., AestheticRuleValidator.cs). They are -->
|
||||
<!-- documented here for completeness. -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P1: Heading Size Hierarchy ── -->
|
||||
<!-- H1 sz >= H2 sz >= H3 sz >= body sz -->
|
||||
<!-- Exception: APA-style where all headings = body size. -->
|
||||
<!-- Implementation: Collect sz from Heading1/2/3 styles and -->
|
||||
<!-- docDefaults. Verify the sizes are non-increasing (ties allowed). -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P2: Maximum 3 Font Families ── -->
|
||||
<!-- Across docDefaults rPr + all style rPr, at most 3 distinct -->
|
||||
<!-- font families (by Ascii name) should be used. -->
|
||||
<!-- WHY: More than 3 fonts creates visual chaos. Professional -->
|
||||
<!-- designs typically use 1-2 families. -->
|
||||
<!-- Implementation: Collect all rFonts.ascii values from -->
|
||||
<!-- docDefaults and all styles. Count distinct. Warn if > 3. -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P3: Heading Space-Before >= Space-After ── -->
|
||||
<!-- For heading styles, spaceBefore should be >= spaceAfter. -->
|
||||
<!-- WHY: Headings should be visually closer to the content they -->
|
||||
<!-- introduce than to the content above. This is the -->
|
||||
<!-- "proximity principle" of Gestalt design. -->
|
||||
<!-- Implementation: For each Heading style, compare pPr spacing -->
|
||||
<!-- before vs after values. -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P4: Spacing-Size Coherence ── -->
|
||||
<!-- Paragraph after-spacing should be proportional to body size: -->
|
||||
<!-- after >= bodySize * 0.5 AND after <= bodySize * 1.5 -->
|
||||
<!-- WHY: Too little spacing makes paragraphs run together. -->
|
||||
<!-- Too much spacing disconnects them. -->
|
||||
<!-- Implementation: Get body sz from docDefaults, convert to DXA -->
|
||||
<!-- (multiply by 10), check after-spacing ratio. -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P5: Color Consistency ── -->
|
||||
<!-- All heading styles should use the same color value. -->
|
||||
<!-- Body text color (if set) should be consistent across styles. -->
|
||||
<!-- WHY: Inconsistent colors look accidental, not designed. -->
|
||||
<!-- Exception: Caption and footnote styles may differ. -->
|
||||
<!-- Implementation: Collect color.val from heading styles. -->
|
||||
<!-- Verify all are identical. -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P6: Indent/Spacing Mutual Exclusion ── -->
|
||||
<!-- If first-line indent > 0 in docDefaults, then after-spacing -->
|
||||
<!-- should be 0 (and vice versa). Using BOTH indent AND spacing -->
|
||||
<!-- is visually redundant — it signals uncertainty. -->
|
||||
<!-- Exception: Headings may override this. -->
|
||||
<!-- Implementation: Check docDefaults pPr. If firstLine > 0 AND -->
|
||||
<!-- after > 0, emit a warning (not error). -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P7: Table Border Consistency ── -->
|
||||
<!-- Within a single table, border styles should be internally -->
|
||||
<!-- consistent (all single, or all none — not a random mix). -->
|
||||
<!-- Implementation: Check tblBorders for consistent val values. -->
|
||||
<!-- -->
|
||||
<!-- ── RULE P8: Line Spacing vs Font Size Harmony ── -->
|
||||
<!-- For fixed line spacing (lineRule="exact"): -->
|
||||
<!-- lineHeight >= fontSize * 1.2 -->
|
||||
<!-- WHY: Fixed line spacing less than 1.2x the font size causes -->
|
||||
<!-- ascender/descender clipping. -->
|
||||
<!-- Implementation: When lineRule="exact", compare line value -->
|
||||
<!-- against the effective font size (line is in 1/20 pt and sz in half-points, so require line >= sz * 12). -->
|
||||
<!-- -->
|
||||
<!-- ============================================================ -->
|
||||
|
||||
</xs:schema>
|
||||
@@ -0,0 +1,130 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- Business Rules Gate-Check Schema for minimax-docx -->
|
||||
<!-- Used in Scenario C (template application) as hard gate -->
|
||||
<!-- Validates business compliance beyond XML correctness -->
|
||||
<!-- MIT License - minimax-docx project -->
|
||||
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
targetNamespace="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
elementFormDefault="qualified">
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Page margins: constrained to reasonable bounds -->
|
||||
<!-- Minimum 360 DXA (0.25 inch), maximum 4320 DXA (3 inches) -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_MarginMeasure">
  <!-- Integer margin limited to the closed interval [360, 4320]. -->
  <xs:restriction base="xs:integer">
    <xs:maxInclusive value="4320"/>
    <xs:minInclusive value="360"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Signed margin (top/bottom can be negative for overlap) -->
|
||||
<xs:simpleType name="ST_SignedMarginMeasure">
  <!-- Integer margin that may be negative: closed interval [-4320, 4320]. -->
  <xs:restriction base="xs:integer">
    <xs:maxInclusive value="4320"/>
    <xs:minInclusive value="-4320"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Font size constraints -->
|
||||
<!-- Body text: 16-144 half-points (8-72pt) -->
|
||||
<!-- Heading text: 20-192 half-points (10-96pt) -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:simpleType name="ST_BodyFontSize">
  <!-- Body font size: positive integer in the closed interval [16, 144]. -->
  <xs:restriction base="xs:positiveInteger">
    <xs:maxInclusive value="144"/>
    <xs:minInclusive value="16"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<xs:simpleType name="ST_HeadingFontSize">
  <!-- Heading font size: positive integer in the closed interval [20, 192]. -->
  <xs:restriction base="xs:positiveInteger">
    <xs:maxInclusive value="192"/>
    <xs:minInclusive value="20"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Page size bounds; common standard sizes (width x height in DXA) are listed for reference -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Letter: 12240 x 15840 -->
|
||||
<!-- A4: 11906 x 16838 -->
|
||||
<!-- Legal: 12240 x 20160 -->
|
||||
<!-- A3: 16838 x 23811 -->
|
||||
<!-- A5: 8391 x 11906 -->
|
||||
|
||||
<xs:simpleType name="ST_PageWidth">
  <!-- Page width: positive integer in the closed interval [5040, 31680].
       Only the range is enforced; no specific paper size is required. -->
  <xs:restriction base="xs:positiveInteger">
    <xs:maxInclusive value="31680"/>
    <xs:minInclusive value="5040"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<xs:simpleType name="ST_PageHeight">
  <!-- Page height: positive integer in the closed interval [5040, 31680].
       Only the range is enforced; no specific paper size is required. -->
  <xs:restriction base="xs:positiveInteger">
    <xs:maxInclusive value="31680"/>
    <xs:minInclusive value="5040"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Constrained section properties for gate-check -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_GateCheckSectPr">
  <!-- Gate-check view of section properties: pgSz and pgMar must each
       appear exactly once, in either order (xs:all). -->
  <xs:all>
    <!-- Page margins: top/bottom use the signed range, left/right the
         unsigned one; header/footer/gutter are optional. -->
    <xs:element name="pgMar">
      <xs:complexType>
        <xs:attribute name="left" use="required" type="w:ST_MarginMeasure"/>
        <xs:attribute name="right" use="required" type="w:ST_MarginMeasure"/>
        <xs:attribute name="top" use="required" type="w:ST_SignedMarginMeasure"/>
        <xs:attribute name="bottom" use="required" type="w:ST_SignedMarginMeasure"/>
        <xs:attribute name="header" type="xs:nonNegativeInteger"/>
        <xs:attribute name="footer" type="xs:nonNegativeInteger"/>
        <xs:attribute name="gutter" type="xs:nonNegativeInteger"/>
      </xs:complexType>
    </xs:element>
    <!-- Page size: bounded width and height, optional orientation. -->
    <xs:element name="pgSz">
      <xs:complexType>
        <xs:attribute name="h" use="required" type="w:ST_PageHeight"/>
        <xs:attribute name="w" use="required" type="w:ST_PageWidth"/>
        <xs:attribute name="orient">
          <xs:simpleType>
            <xs:restriction base="xs:string">
              <xs:enumeration value="landscape"/>
              <xs:enumeration value="portrait"/>
            </xs:restriction>
          </xs:simpleType>
        </xs:attribute>
      </xs:complexType>
    </xs:element>
  </xs:all>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Required styles: at minimum Normal and Heading1 must exist -->
|
||||
<!-- This is enforced programmatically by GateCheckValidator -->
|
||||
<!-- rather than via XSD, since XSD cannot validate style presence -->
|
||||
<!-- across separate XML parts. -->
|
||||
<!-- ============================================================ -->
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Constrained run properties for font size validation -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_GateCheckRPr">
  <!-- Gate-check view of run properties: sz and szCs are each optional,
       and when present their val must satisfy w:ST_BodyFontSize. -->
  <xs:all>
    <xs:element name="szCs" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="w:ST_BodyFontSize"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="sz" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="w:ST_BodyFontSize"/>
      </xs:complexType>
    </xs:element>
  </xs:all>
</xs:complexType>
|
||||
|
||||
</xs:schema>
|
||||
@@ -0,0 +1,159 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- Common type definitions for WordprocessingML subset schema -->
|
||||
<!-- MIT License - minimax-docx project -->
|
||||
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
targetNamespace="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
elementFormDefault="qualified">
|
||||
|
||||
<!-- Measurement: non-negative twips (1/1440 inch) -->
|
||||
<xs:simpleType name="ST_TwipsMeasure">
  <xs:restriction base="xs:nonNegativeInteger">
    <!-- No extra facets: any non-negative integer is accepted. -->
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Measurement: signed twips (for negative margins/indents) -->
|
||||
<xs:simpleType name="ST_SignedTwipsMeasure">
  <xs:restriction base="xs:integer">
    <!-- No extra facets: any integer, negative values included. -->
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Half-point measure for font sizes (1 = 0.5pt) -->
|
||||
<xs:simpleType name="ST_HpsMeasure">
  <xs:restriction base="xs:positiveInteger">
    <!-- No extra facets: any integer >= 1 is accepted. -->
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Hex color: the literal "auto" or exactly 6 hex digits (RRGGBB) -->
|
||||
<xs:simpleType name="ST_HexColor">
  <!-- Accepts either the keyword "auto" or six hexadecimal digits
       (upper or lower case). -->
  <xs:restriction base="xs:string">
    <xs:pattern value="auto|[0-9a-fA-F]{6}"></xs:pattern>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- On/Off toggle -->
|
||||
<xs:simpleType name="ST_OnOff">
  <!-- Boolean toggle value. WordprocessingML allows the xsd:boolean
       spellings (true/false/1/0) and also the legacy "on"/"off" literals.
       The original list omitted "on"/"off", so documents using them were
       rejected; they are added here. All previously accepted values are
       still accepted. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="true"/>
    <xs:enumeration value="false"/>
    <xs:enumeration value="0"/>
    <xs:enumeration value="1"/>
    <xs:enumeration value="on"/>
    <xs:enumeration value="off"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Justification -->
|
||||
<xs:simpleType name="ST_Jc">
  <!-- Closed list of justification keywords accepted by this subset. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="start"/>
    <xs:enumeration value="end"/>
    <xs:enumeration value="left"/>
    <xs:enumeration value="right"/>
    <xs:enumeration value="center"/>
    <xs:enumeration value="both"/>
    <xs:enumeration value="distribute"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Break type -->
|
||||
<xs:simpleType name="ST_BrType">
  <!-- Closed list of break kinds. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="textWrapping"/>
    <xs:enumeration value="column"/>
    <xs:enumeration value="page"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Underline patterns -->
|
||||
<xs:simpleType name="ST_Underline">
  <!-- Closed list of the ten underline patterns accepted by this subset. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="none"/>
    <xs:enumeration value="single"/>
    <xs:enumeration value="double"/>
    <xs:enumeration value="thick"/>
    <xs:enumeration value="words"/>
    <xs:enumeration value="dash"/>
    <xs:enumeration value="dotted"/>
    <xs:enumeration value="dotDash"/>
    <xs:enumeration value="dotDotDash"/>
    <xs:enumeration value="wave"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Vertical alignment for subscript/superscript -->
|
||||
<xs:simpleType name="ST_VerticalAlignRun">
  <!-- Closed list of run vertical-alignment keywords. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="baseline"/>
    <xs:enumeration value="subscript"/>
    <xs:enumeration value="superscript"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Section break type -->
|
||||
<xs:simpleType name="ST_SectionMark">
  <!-- Closed list of section break kinds. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="continuous"/>
    <xs:enumeration value="nextColumn"/>
    <xs:enumeration value="nextPage"/>
    <xs:enumeration value="evenPage"/>
    <xs:enumeration value="oddPage"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Header/footer type -->
|
||||
<xs:simpleType name="ST_HdrFtr">
  <!-- Closed list of header/footer kinds. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="default"/>
    <xs:enumeration value="first"/>
    <xs:enumeration value="even"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Table width type -->
|
||||
<xs:simpleType name="ST_TblWidth">
  <!-- Closed list of table width unit keywords. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="nil"/>
    <xs:enumeration value="auto"/>
    <xs:enumeration value="dxa"/>
    <xs:enumeration value="pct"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Vertical merge -->
|
||||
<xs:simpleType name="ST_Merge">
  <!-- Closed list of merge states: start a new merged region or continue one. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="restart"/>
    <xs:enumeration value="continue"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Highlight colors -->
|
||||
<xs:simpleType name="ST_HighlightColor">
  <!-- Closed list of the seventeen named highlight colors, grouped here as
       none / neutrals / primaries / dark variants. -->
  <xs:restriction base="xs:string">
    <xs:enumeration value="none"/>
    <xs:enumeration value="black"/>
    <xs:enumeration value="white"/>
    <xs:enumeration value="lightGray"/>
    <xs:enumeration value="darkGray"/>
    <xs:enumeration value="red"/>
    <xs:enumeration value="green"/>
    <xs:enumeration value="blue"/>
    <xs:enumeration value="cyan"/>
    <xs:enumeration value="magenta"/>
    <xs:enumeration value="yellow"/>
    <xs:enumeration value="darkRed"/>
    <xs:enumeration value="darkGreen"/>
    <xs:enumeration value="darkBlue"/>
    <xs:enumeration value="darkCyan"/>
    <xs:enumeration value="darkMagenta"/>
    <xs:enumeration value="darkYellow"/>
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Decimal number: any signed integer (used for table width pct values, etc.) -->
|
||||
<xs:simpleType name="ST_DecimalNumber">
  <xs:restriction base="xs:integer">
    <!-- No extra facets: any integer is accepted. -->
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
<!-- Relationship ID reference -->
|
||||
<xs:simpleType name="ST_RelationshipId">
  <xs:restriction base="xs:string">
    <!-- No extra facets: any string is accepted as a relationship id. -->
  </xs:restriction>
</xs:simpleType>
|
||||
|
||||
</xs:schema>
|
||||
@@ -0,0 +1,589 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- WordprocessingML Subset Schema for minimax-docx -->
|
||||
<!-- Curated subset of ISO 29500 covering elements agents commonly generate -->
|
||||
<!-- MIT License - minimax-docx project -->
|
||||
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
||||
xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"
|
||||
targetNamespace="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
elementFormDefault="qualified">
|
||||
|
||||
<!-- Foreign namespaces referenced by this schema. No schemaLocation is
     given, so the processor must be supplied with (or already know) the
     corresponding schema documents. -->
<xs:import namespace="http://schemas.openxmlformats.org/officeDocument/2006/relationships"/>
<xs:import namespace="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"/>
<!-- Needed because CT_Text references xml:space: a QName from another
     namespace may only be referenced when that namespace is imported. -->
<xs:import namespace="http://www.w3.org/XML/1998/namespace"/>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Root element -->
|
||||
<!-- ============================================================ -->
|
||||
<!-- Global root element of a main document part. -->
<xs:element type="w:CT_Document" name="document"/>

<xs:complexType name="CT_Document">
  <!-- A document holds at most one body. -->
  <xs:sequence>
    <xs:element name="body" minOccurs="0" type="w:CT_Body"/>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Body -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_Body">
  <xs:sequence>
    <!-- Any number of block-level items, freely interleaved. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
      <xs:element name="bookmarkStart" type="w:CT_BookmarkStart"/>
      <xs:element name="bookmarkEnd" type="w:CT_BookmarkEnd"/>
      <xs:element name="sdt" type="w:CT_SdtBlock"/>
      <xs:element name="tbl" type="w:CT_Tbl"/>
      <xs:element name="p" type="w:CT_P"/>
    </xs:choice>
    <!-- Optional section properties; when present they must come last. -->
    <xs:element name="sectPr" minOccurs="0" type="w:CT_SectPr"/>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Paragraph -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_P">
  <xs:sequence>
    <!-- Optional paragraph properties must precede all inline content. -->
    <xs:element name="pPr" minOccurs="0" type="w:CT_PPr"/>
    <!-- Any number of inline items, freely interleaved. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
      <xs:element name="r" type="w:CT_R"/>
      <xs:element name="ins" type="w:CT_RunTrackChange"/>
      <xs:element name="del" type="w:CT_RunTrackChange"/>
      <xs:element name="hyperlink" type="w:CT_Hyperlink"/>
      <xs:element name="commentRangeStart" type="w:CT_MarkupRange"/>
      <xs:element name="commentRangeEnd" type="w:CT_MarkupRange"/>
      <xs:element name="bookmarkStart" type="w:CT_BookmarkStart"/>
      <xs:element name="bookmarkEnd" type="w:CT_BookmarkEnd"/>
    </xs:choice>
  </xs:sequence>
  <!-- Optional relationship id attribute from the r: namespace. -->
  <xs:attribute ref="r:id"/>
</xs:complexType>
|
||||
|
||||
<!-- Paragraph Properties -->
|
||||
<xs:complexType name="CT_PPr">
  <!-- xs:all: each child is optional, may appear at most once, and the
       children may come in any order. -->
  <xs:all>
    <!-- Style reference -->
    <xs:element name="pStyle" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <!-- On/off toggles -->
    <xs:element name="keepNext" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="keepLines" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="pageBreakBefore" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="widowControl" minOccurs="0" type="w:CT_OnOff"/>
    <!-- Numbering, spacing, indentation -->
    <xs:element name="numPr" minOccurs="0" type="w:CT_NumPr"/>
    <xs:element name="spacing" minOccurs="0" type="w:CT_Spacing"/>
    <xs:element name="ind" minOccurs="0" type="w:CT_Ind"/>
    <!-- Justification: val is an unconstrained string here. -->
    <xs:element name="jc" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="outlineLvl" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:integer"/>
      </xs:complexType>
    </xs:element>
    <!-- Nested property containers -->
    <xs:element name="rPr" minOccurs="0" type="w:CT_RPr"/>
    <xs:element name="pBdr" minOccurs="0" type="w:CT_PBdr"/>
    <xs:element name="shd" minOccurs="0" type="w:CT_Shd"/>
    <xs:element name="tabs" minOccurs="0" type="w:CT_Tabs"/>
    <xs:element name="sectPr" minOccurs="0" type="w:CT_SectPr"/>
  </xs:all>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Run -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_R">
  <xs:sequence>
    <!-- Optional run properties must precede the run content. -->
    <xs:element name="rPr" minOccurs="0" type="w:CT_RPr"/>
    <!-- Any number of content items, freely interleaved. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
      <xs:element name="t" type="w:CT_Text"/>
      <xs:element name="delText" type="w:CT_Text"/>
      <xs:element name="tab" type="w:CT_Empty"/>
      <xs:element name="cr" type="w:CT_Empty"/>
      <xs:element name="br" type="w:CT_Br"/>
      <xs:element name="drawing" type="w:CT_Drawing"/>
      <xs:element name="footnoteReference" type="w:CT_FtnEdnRef"/>
      <xs:element name="endnoteReference" type="w:CT_FtnEdnRef"/>
      <xs:element name="commentReference" type="w:CT_MarkupRef"/>
    </xs:choice>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<!-- Run Properties -->
|
||||
<xs:complexType name="CT_RPr">
  <!-- xs:all: each child is optional, may appear at most once, and the
       children may come in any order. -->
  <xs:all>
    <!-- Style reference and fonts -->
    <xs:element name="rStyle" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="rFonts" minOccurs="0" type="w:CT_Fonts"/>
    <!-- On/off toggles -->
    <xs:element name="b" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="bCs" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="i" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="iCs" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="caps" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="smallCaps" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="strike" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="dstrike" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="vanish" minOccurs="0" type="w:CT_OnOff"/>
    <!-- Color: val is an unconstrained string in this subset. -->
    <xs:element name="color" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
        <xs:attribute name="themeColor" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <!-- Character spacing: any integer, negative allowed. -->
    <xs:element name="spacing" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:integer"/>
      </xs:complexType>
    </xs:element>
    <!-- Font sizes: any integer >= 1. -->
    <xs:element name="sz" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:positiveInteger"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="szCs" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:positiveInteger"/>
      </xs:complexType>
    </xs:element>
    <!-- Highlight, underline, vertical alignment: values are unconstrained
         strings in this subset. -->
    <xs:element name="highlight" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="u" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
        <xs:attribute name="color" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="vertAlign" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <!-- Language tags: all three attributes are optional. -->
    <xs:element name="lang" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" type="xs:string"/>
        <xs:attribute name="eastAsia" type="xs:string"/>
        <xs:attribute name="bidi" type="xs:string"/>
      </xs:complexType>
    </xs:element>
  </xs:all>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Text -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType mixed="true" name="CT_Text">
  <!-- Text-only content (mixed, no child elements declared) plus an
       optional xml:space attribute. -->
  <xs:attribute ref="xml:space"/>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Table -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_Tbl">
  <xs:sequence>
    <!-- Optional properties, then optional grid, in this order. -->
    <xs:element name="tblPr" minOccurs="0" type="w:CT_TblPr"/>
    <xs:element name="tblGrid" minOccurs="0" type="w:CT_TblGrid"/>
    <!-- Any number of rows and bookmark markers, freely interleaved. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
      <xs:element name="bookmarkStart" type="w:CT_BookmarkStart"/>
      <xs:element name="bookmarkEnd" type="w:CT_BookmarkEnd"/>
      <xs:element name="tr" type="w:CT_Row"/>
    </xs:choice>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TblPr">
  <!-- xs:all: each child is optional, may appear at most once, and the
       children may come in any order. -->
  <xs:all>
    <xs:element name="tblStyle" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <!-- Width and indent share the same width type. -->
    <xs:element name="tblW" minOccurs="0" type="w:CT_TblWidth"/>
    <xs:element name="jc" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="tblInd" minOccurs="0" type="w:CT_TblWidth"/>
    <xs:element name="tblBorders" minOccurs="0" type="w:CT_TblBorders"/>
    <xs:element name="shd" minOccurs="0" type="w:CT_Shd"/>
    <xs:element name="tblLayout" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="type" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="tblCellMar" minOccurs="0" type="w:CT_TblCellMar"/>
    <!-- Conditional-formatting flags: every attribute is an optional,
         unconstrained string. -->
    <xs:element name="tblLook" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" type="xs:string"/>
        <xs:attribute name="firstRow" type="xs:string"/>
        <xs:attribute name="lastRow" type="xs:string"/>
        <xs:attribute name="firstColumn" type="xs:string"/>
        <xs:attribute name="lastColumn" type="xs:string"/>
        <xs:attribute name="noHBand" type="xs:string"/>
        <xs:attribute name="noVBand" type="xs:string"/>
      </xs:complexType>
    </xs:element>
  </xs:all>
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TblGrid">
  <!-- Zero or more grid columns, each with an optional non-negative width. -->
  <xs:sequence>
    <xs:element name="gridCol" maxOccurs="unbounded" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="w" type="xs:nonNegativeInteger"/>
      </xs:complexType>
    </xs:element>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Row">
  <xs:sequence>
    <!-- Optional row properties must come first. -->
    <xs:element name="trPr" minOccurs="0" type="w:CT_TrPr"/>
    <!-- Any number of cells and bookmark markers, freely interleaved. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
      <xs:element name="bookmarkStart" type="w:CT_BookmarkStart"/>
      <xs:element name="bookmarkEnd" type="w:CT_BookmarkEnd"/>
      <xs:element name="tc" type="w:CT_Cell"/>
    </xs:choice>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TrPr">
  <!-- xs:all: each child is optional, may appear at most once, and the
       children may come in any order. -->
  <xs:all>
    <xs:element name="tblHeader" minOccurs="0" type="w:CT_OnOff"/>
    <xs:element name="jc" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <!-- Row height: both the value and the rule are optional. -->
    <xs:element name="trHeight" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="hRule" type="xs:string"/>
        <xs:attribute name="val" type="xs:nonNegativeInteger"/>
      </xs:complexType>
    </xs:element>
  </xs:all>
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Cell">
  <xs:sequence>
    <!-- Optional cell properties must come first. -->
    <xs:element name="tcPr" minOccurs="0" type="w:CT_TcPr"/>
    <!-- Cell content: paragraphs and nested tables in any mix. -->
    <xs:choice minOccurs="0" maxOccurs="unbounded">
      <xs:element name="tbl" type="w:CT_Tbl"/>
      <xs:element name="p" type="w:CT_P"/>
    </xs:choice>
  </xs:sequence>
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TcPr">
  <!-- xs:all: each child is optional, may appear at most once, and the
       children may come in any order. -->
  <xs:all>
    <xs:element name="tcW" minOccurs="0" type="w:CT_TblWidth"/>
    <!-- Horizontal span: integer >= 1. -->
    <xs:element name="gridSpan" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:positiveInteger"/>
      </xs:complexType>
    </xs:element>
    <!-- Vertical merge: val may be omitted. -->
    <xs:element name="vMerge" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="tcBorders" minOccurs="0" type="w:CT_TcBorders"/>
    <xs:element name="shd" minOccurs="0" type="w:CT_Shd"/>
    <xs:element name="vAlign" minOccurs="0">
      <xs:complexType>
        <xs:attribute name="val" use="required" type="xs:string"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="noWrap" minOccurs="0" type="w:CT_OnOff"/>
  </xs:all>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Section Properties -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_SectPr">
  <!-- Section properties.

       The original model was xs:all, which limits every child to at most
       one occurrence. A section may legitimately carry several
       headerReference / footerReference children (default, first, even),
       and this schema models titlePg, so such documents were rejected.

       XSD 1.0 does not allow maxOccurs > 1 inside xs:all, and xs:all cannot
       be nested in a sequence. The content is therefore a repeatable
       choice: children may still appear in any order, and everything the
       old model accepted is still accepted.

       Trade-off: the schema no longer rejects duplicates of the other
       children (pgSz, pgMar, ...). That must be checked programmatically
       if it matters. -->
  <xs:choice minOccurs="0" maxOccurs="unbounded">
    <!-- Header/footer links: may repeat, one per header/footer kind. -->
    <xs:element name="headerReference">
      <xs:complexType>
        <xs:attribute name="type" type="xs:string" use="required"/>
        <xs:attribute ref="r:id" use="required"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="footerReference">
      <xs:complexType>
        <xs:attribute name="type" type="xs:string" use="required"/>
        <xs:attribute ref="r:id" use="required"/>
      </xs:complexType>
    </xs:element>
    <!-- Section break kind -->
    <xs:element name="type">
      <xs:complexType>
        <xs:attribute name="val" type="xs:string" use="required"/>
      </xs:complexType>
    </xs:element>
    <!-- Page size: width and height are mandatory. -->
    <xs:element name="pgSz">
      <xs:complexType>
        <xs:attribute name="w" type="xs:nonNegativeInteger" use="required"/>
        <xs:attribute name="h" type="xs:nonNegativeInteger" use="required"/>
        <xs:attribute name="orient" type="xs:string" use="optional"/>
      </xs:complexType>
    </xs:element>
    <!-- Page margins: top/bottom may be negative, left/right may not. -->
    <xs:element name="pgMar">
      <xs:complexType>
        <xs:attribute name="top" type="xs:integer" use="required"/>
        <xs:attribute name="right" type="xs:nonNegativeInteger" use="required"/>
        <xs:attribute name="bottom" type="xs:integer" use="required"/>
        <xs:attribute name="left" type="xs:nonNegativeInteger" use="required"/>
        <xs:attribute name="header" type="xs:nonNegativeInteger" use="optional"/>
        <xs:attribute name="footer" type="xs:nonNegativeInteger" use="optional"/>
        <xs:attribute name="gutter" type="xs:nonNegativeInteger" use="optional"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="pgNumType">
      <xs:complexType>
        <xs:attribute name="fmt" type="xs:string" use="optional"/>
        <xs:attribute name="start" type="xs:nonNegativeInteger" use="optional"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="cols">
      <xs:complexType>
        <xs:attribute name="space" type="xs:nonNegativeInteger" use="optional"/>
        <xs:attribute name="num" type="xs:positiveInteger" use="optional"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="docGrid">
      <xs:complexType>
        <xs:attribute name="linePitch" type="xs:nonNegativeInteger" use="optional"/>
        <xs:attribute name="type" type="xs:string" use="optional"/>
      </xs:complexType>
    </xs:element>
    <xs:element name="titlePg" type="w:CT_OnOff"/>
  </xs:choice>
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Hyperlink -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_Hyperlink">
|
||||
<xs:sequence>
|
||||
<xs:element name="r" type="w:CT_R" minOccurs="0" maxOccurs="unbounded"/>
|
||||
</xs:sequence>
|
||||
<xs:attribute ref="r:id" use="optional"/>
|
||||
<xs:attribute name="anchor" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="history" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Track Changes -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_RunTrackChange">
|
||||
<xs:sequence>
|
||||
<xs:choice minOccurs="0" maxOccurs="unbounded">
|
||||
<xs:element name="r" type="w:CT_R"/>
|
||||
</xs:choice>
|
||||
</xs:sequence>
|
||||
<xs:attribute name="id" type="xs:nonNegativeInteger" use="required"/>
|
||||
<xs:attribute name="author" type="xs:string" use="required"/>
|
||||
<xs:attribute name="date" type="xs:dateTime" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Bookmarks -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_BookmarkStart">
|
||||
<xs:attribute name="id" type="xs:nonNegativeInteger" use="required"/>
|
||||
<xs:attribute name="name" type="xs:string" use="required"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_BookmarkEnd">
|
||||
<xs:attribute name="id" type="xs:nonNegativeInteger" use="required"/>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Comments -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_MarkupRange">
|
||||
<xs:attribute name="id" type="xs:nonNegativeInteger" use="required"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_MarkupRef">
|
||||
<xs:attribute name="id" type="xs:nonNegativeInteger" use="required"/>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Footnote/Endnote reference -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_FtnEdnRef">
|
||||
<xs:attribute name="id" type="xs:nonNegativeInteger" use="required"/>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Drawing (basic inline image) -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_Drawing">
|
||||
<xs:sequence>
|
||||
<xs:any namespace="##other" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>
|
||||
</xs:sequence>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Structured Document Tag (content control) -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_SdtBlock">
|
||||
<xs:sequence>
|
||||
<xs:element name="sdtPr" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:sequence>
|
||||
<xs:any processContents="lax" minOccurs="0" maxOccurs="unbounded"/>
|
||||
</xs:sequence>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
<xs:element name="sdtContent" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:choice minOccurs="0" maxOccurs="unbounded">
|
||||
<xs:element name="p" type="w:CT_P"/>
|
||||
<xs:element name="tbl" type="w:CT_Tbl"/>
|
||||
</xs:choice>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
</xs:sequence>
|
||||
</xs:complexType>
|
||||
|
||||
<!-- ============================================================ -->
|
||||
<!-- Helper types -->
|
||||
<!-- ============================================================ -->
|
||||
<xs:complexType name="CT_OnOff">
|
||||
<xs:attribute name="val" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Empty"/>
|
||||
|
||||
<xs:complexType name="CT_Br">
|
||||
<xs:attribute name="type" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="clear" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Fonts">
|
||||
<xs:attribute name="ascii" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="hAnsi" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="eastAsia" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="cs" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="asciiTheme" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="hAnsiTheme" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="eastAsiaTheme" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="cstheme" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_NumPr">
|
||||
<xs:all>
|
||||
<xs:element name="ilvl" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="val" type="xs:nonNegativeInteger" use="required"/>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
<xs:element name="numId" minOccurs="0">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="val" type="xs:nonNegativeInteger" use="required"/>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
</xs:all>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Spacing">
|
||||
<xs:attribute name="before" type="xs:nonNegativeInteger" use="optional"/>
|
||||
<xs:attribute name="after" type="xs:nonNegativeInteger" use="optional"/>
|
||||
<xs:attribute name="line" type="xs:integer" use="optional"/>
|
||||
<xs:attribute name="lineRule" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="beforeAutospacing" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="afterAutospacing" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Ind">
|
||||
<xs:attribute name="left" type="xs:integer" use="optional"/>
|
||||
<xs:attribute name="right" type="xs:integer" use="optional"/>
|
||||
<xs:attribute name="hanging" type="xs:nonNegativeInteger" use="optional"/>
|
||||
<xs:attribute name="firstLine" type="xs:nonNegativeInteger" use="optional"/>
|
||||
<xs:attribute name="start" type="xs:integer" use="optional"/>
|
||||
<xs:attribute name="end" type="xs:integer" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TblWidth">
|
||||
<xs:attribute name="w" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="type" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Shd">
|
||||
<xs:attribute name="val" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="color" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="fill" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="themeFill" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Border">
|
||||
<xs:attribute name="val" type="xs:string" use="required"/>
|
||||
<xs:attribute name="sz" type="xs:nonNegativeInteger" use="optional"/>
|
||||
<xs:attribute name="space" type="xs:nonNegativeInteger" use="optional"/>
|
||||
<xs:attribute name="color" type="xs:string" use="optional"/>
|
||||
<xs:attribute name="themeColor" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_PBdr">
|
||||
<xs:all>
|
||||
<xs:element name="top" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="left" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="bottom" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="right" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="between" type="w:CT_Border" minOccurs="0"/>
|
||||
</xs:all>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TblBorders">
|
||||
<xs:all>
|
||||
<xs:element name="top" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="left" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="bottom" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="right" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="insideH" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="insideV" type="w:CT_Border" minOccurs="0"/>
|
||||
</xs:all>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TcBorders">
|
||||
<xs:all>
|
||||
<xs:element name="top" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="left" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="bottom" type="w:CT_Border" minOccurs="0"/>
|
||||
<xs:element name="right" type="w:CT_Border" minOccurs="0"/>
|
||||
</xs:all>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_TblCellMar">
|
||||
<xs:all>
|
||||
<xs:element name="top" type="w:CT_TblWidth" minOccurs="0"/>
|
||||
<xs:element name="left" type="w:CT_TblWidth" minOccurs="0"/>
|
||||
<xs:element name="bottom" type="w:CT_TblWidth" minOccurs="0"/>
|
||||
<xs:element name="right" type="w:CT_TblWidth" minOccurs="0"/>
|
||||
</xs:all>
|
||||
</xs:complexType>
|
||||
|
||||
<xs:complexType name="CT_Tabs">
|
||||
<xs:sequence>
|
||||
<xs:element name="tab" minOccurs="0" maxOccurs="unbounded">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="val" type="xs:string" use="required"/>
|
||||
<xs:attribute name="pos" type="xs:integer" use="required"/>
|
||||
<xs:attribute name="leader" type="xs:string" use="optional"/>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
</xs:sequence>
|
||||
</xs:complexType>
|
||||
|
||||
</xs:schema>
|
||||
@@ -0,0 +1,357 @@
|
||||
# CJK Typography & Mixed-Script Guide
|
||||
|
||||
Rules for Chinese, Japanese, and Korean text in DOCX documents.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [Font Selection](#font-selection)
|
||||
2. [Font Size Names (CJK)](#font-size-names)
|
||||
3. [RunFonts Mapping](#runfonts-mapping)
|
||||
4. [Punctuation & Line Breaking](#punctuation--line-breaking)
|
||||
5. [Paragraph Indentation](#paragraph-indentation)
|
||||
6. [Line Spacing for CJK](#line-spacing)
|
||||
7. [Chinese Government Standard (GB/T 9704)](#gbt-9704)
|
||||
8. [Mixed CJK + Latin Best Practices](#mixed-script)
|
||||
9. [OpenXML Quick Reference](#openxml-quick-reference)
|
||||
|
||||
---
|
||||
|
||||
## Font Selection
|
||||
|
||||
### Recommended CJK Fonts
|
||||
|
||||
| Language | Serif (正文) | Sans (标题) | Notes |
|
||||
|----------|-------------|-------------|-------|
|
||||
| **Simplified Chinese** | 宋体 (SimSun) | 微软雅黑 (Microsoft YaHei) | YaHei for screen, SimSun for print |
|
||||
| **Simplified Chinese** | 仿宋 (FangSong) | 黑体 (SimHei) | Government documents |
|
||||
| **Traditional Chinese** | 新細明體 (PMingLiU) | 微軟正黑體 (Microsoft JhengHei) | Taiwan standard |
|
||||
| **Japanese** | MS 明朝 (MS Mincho) | MS ゴシック (MS Gothic) | Classic pairing |
|
||||
| **Japanese** | 游明朝 (Yu Mincho) | 游ゴシック (Yu Gothic) | Modern, Windows 10+ |
|
||||
| **Korean** | 바탕 (Batang) | 맑은 고딕 (Malgun Gothic) | Standard pairing |
|
||||
|
||||
### Government Document Fonts (公文)
|
||||
|
||||
| Element | Font | Size |
|
||||
|---------|------|------|
|
||||
| 标题 (title) | 小标宋 (FZXiaoBiaoSong-B05S) | 二号 (22pt) |
|
||||
| 一级标题 | 黑体 (SimHei) | 三号 (16pt) |
|
||||
| 二级标题 | 楷体_GB2312 (KaiTi_GB2312) | 三号 (16pt) |
|
||||
| 三级标题 | 仿宋_GB2312 加粗 | 三号 (16pt) |
|
||||
| 正文 (body) | 仿宋_GB2312 (FangSong_GB2312) | 三号 (16pt) |
|
||||
| 附注/页码 | 宋体 (SimSun) | 四号 (14pt) |
|
||||
|
||||
---
|
||||
|
||||
## Font Size Names
|
||||
|
||||
CJK uses named sizes. Map to points and `w:sz` half-point values:
|
||||
|
||||
| 字号 | Points | `w:sz` | Common Use |
|
||||
|------|--------|--------|------------|
|
||||
| 初号 | 42pt | 84 | Display title |
|
||||
| 小初 | 36pt | 72 | Large title |
|
||||
| 一号 | 26pt | 52 | Chapter heading |
|
||||
| 小一 | 24pt | 48 | Major heading |
|
||||
| 二号 | 22pt | 44 | Document title (公文) |
|
||||
| 小二 | 18pt | 36 | Western H1 equivalent |
|
||||
| 三号 | 16pt | 32 | CJK heading / 公文 body |
|
||||
| 小三 | 15pt | 30 | Sub-heading |
|
||||
| 四号 | 14pt | 28 | CJK subheading |
|
||||
| 小四 | 12pt | 24 | Standard body (CJK) |
|
||||
| 五号 | 10.5pt | 21 | Compact CJK body |
|
||||
| 小五 | 9pt | 18 | Footnotes |
|
||||
| 六号 | 7.5pt | 15 | Fine print |
|
||||
|
||||
---
|
||||
|
||||
## RunFonts Mapping
|
||||
|
||||
OpenXML uses four font slots to handle multilingual text:
|
||||
|
||||
```xml
|
||||
<!-- w:ascii    — Latin characters (U+0000–U+007F) -->
|
||||
<!-- w:hAnsi    — Latin extended, Greek, Cyrillic -->
|
||||
<!-- w:eastAsia — CJK Unified Ideographs, Kana, Hangul -->
|
||||
<!-- w:cs       — Arabic, Hebrew, Thai, Devanagari -->
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:eastAsia="SimSun" w:cs="Arial" />
|
||||
```
|
||||
|
||||
**Word's character classification logic:**
|
||||
|
||||
1. Character is in CJK range → uses `w:eastAsia` font
|
||||
2. Character is in complex script range → uses `w:cs` font
|
||||
3. Character is basic Latin (ASCII) → uses `w:ascii` font
|
||||
4. Everything else → uses `w:hAnsi` font
|
||||
|
||||
**Key**: `w:eastAsia` is the **only** way to set CJK fonts. Setting just `w:ascii` will NOT affect CJK characters. Mixed text within a single run auto-switches fonts at the character level — no need for separate runs.
|
||||
|
||||
### Document Defaults
|
||||
|
||||
```xml
|
||||
<w:docDefaults>
|
||||
<w:rPrDefault>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:eastAsia="SimSun" w:cs="Arial" />
|
||||
<w:sz w:val="22" />
|
||||
<w:szCs w:val="22" />
|
||||
<w:lang w:val="en-US" w:eastAsia="zh-CN" />
|
||||
</w:rPr>
|
||||
</w:rPrDefault>
|
||||
</w:docDefaults>
|
||||
```
|
||||
|
||||
`w:lang w:eastAsia` helps Word resolve ambiguous characters (e.g., punctuation shared between CJK and Latin).
|
||||
|
||||
---
|
||||
|
||||
## Punctuation & Line Breaking
|
||||
|
||||
### Full-Width vs Half-Width
|
||||
|
||||
CJK text uses full-width punctuation:
|
||||
|
||||
| Type | CJK | Latin |
|
||||
|------|-----|-------|
|
||||
| Period | 。(U+3002) | . |
|
||||
| Comma | ,(U+FF0C) 、(U+3001) | , |
|
||||
| Colon | :(U+FF1A) | : |
|
||||
| Semicolon | ;(U+FF1B) | ; |
|
||||
| Quotes | 「」『』 or ""'' | "" '' |
|
||||
| Parentheses | ()(U+FF08/09) | () |
|
||||
|
||||
In mixed text, use the punctuation style of the **surrounding language context**.
|
||||
|
||||
### OpenXML Controls
|
||||
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:adjustRightInd w:val="true" /> <!-- Adjust right indent for CJK punctuation -->
|
||||
<w:snapToGrid w:val="true" /> <!-- Align to document grid -->
|
||||
<w:kinsoku w:val="true" /> <!-- Enable CJK line breaking rules -->
|
||||
<w:overflowPunct w:val="true" /> <!-- Allow punctuation to overflow margins -->
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
### Kinsoku Rules (禁則処理)
|
||||
|
||||
Prevents certain characters from appearing at the start or end of a line:
|
||||
- **Cannot start a line**: `)」』】〉》。、,!?;:` and closing brackets
|
||||
- **Cannot end a line**: `(「『【〈《` and opening brackets
|
||||
|
||||
Word applies these automatically when `w:kinsoku` is enabled.
|
||||
|
||||
### Line Breaking
|
||||
|
||||
- CJK characters can break between **any two characters** (no word boundaries needed)
|
||||
- Latin words within CJK text still follow word-boundary breaking
|
||||
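- `<w:wordWrap w:val="false"/>` overrides the previous rule: Latin words may then also break at the character level (break anywhere), as CJK text does. Leave it at the default to keep word-boundary breaking for Latin words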
|
||||
|
||||
---
|
||||
|
||||
## Paragraph Indentation
|
||||
|
||||
### Chinese Standard: 2-Character Indent
|
||||
|
||||
Chinese body text conventionally uses a 2-character first-line indent:
|
||||
|
||||
```xml
|
||||
<w:ind w:firstLineChars="200" /> <!-- 200 = 2 characters × 100 -->
|
||||
```
|
||||
|
||||
Preferred over `w:firstLine` with fixed DXA because `firstLineChars` scales with font size.
|
||||
|
||||
| Indent | Value |
|
||||
|--------|-------|
|
||||
| 1 character | `w:firstLineChars="100"` |
|
||||
| 2 characters | `w:firstLineChars="200"` |
|
||||
| 3 characters | `w:firstLineChars="300"` |
|
||||
|
||||
---
|
||||
|
||||
## Line Spacing
|
||||
|
||||
- CJK characters are taller than Latin characters at the same point size
|
||||
- Default `1.0` line spacing may feel cramped with CJK text
|
||||
- Recommended: a `1.15–1.5` line multiple for mixed CJK+Latin; an exact 28pt line height (`w:line="560"`, `w:lineRule="exact"`) for 公文
|
||||
|
||||
### Auto Spacing
|
||||
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:autoSpaceDE w:val="true"/> <!-- auto space between CJK and Latin -->
|
||||
<w:autoSpaceDN w:val="true"/> <!-- auto space between CJK and numbers -->
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
Adds ~¼ em spacing between CJK and non-CJK characters automatically. **Recommended: always enable.**
|
||||
|
||||
---
|
||||
|
||||
## GB/T 9704
|
||||
|
||||
Chinese government document standard (党政机关公文格式). These are **strict requirements**, not suggestions.
|
||||
|
||||
### Page Setup
|
||||
|
||||
| Parameter | Value | OpenXML |
|
||||
|-----------|-------|---------|
|
||||
| Page size | A4 (210×297mm) | Width=11906, Height=16838 |
|
||||
| Top margin | 37mm | 2098 DXA |
|
||||
| Bottom margin | 35mm | 1984 DXA |
|
||||
| Left margin | 28mm | 1588 DXA |
|
||||
| Right margin | 26mm | 1474 DXA |
|
||||
| Characters/line | 28 | |
|
||||
| Lines/page | 22 | |
|
||||
| Line spacing | Fixed 28pt | `line="560" lineRule="exact"` |
|
||||
|
||||
### Document Structure
|
||||
|
||||
```
|
||||
┌─────────────────────────────────┐
|
||||
│ 发文机关标志 (红头) │ ← 小标宋 or 红色大字
|
||||
│ ══════════════════ (红线) │ ← Red #FF0000, 2pt
|
||||
├─────────────────────────────────┤
|
||||
│ 发文字号: X机发〔2025〕X号 │ ← 仿宋 三号, centered
|
||||
│ │
|
||||
│ 标题 (Title) │ ← 小标宋 二号, centered
|
||||
│ │ 可分多行,回行居中
|
||||
│ 主送机关: │ ← 仿宋 三号
|
||||
│ │
|
||||
│ 正文 (Body)... │ ← 仿宋_GB2312 三号
|
||||
│ 一、一级标题 │ ← 黑体 三号
|
||||
│ (一)二级标题 │ ← 楷体 三号
|
||||
│ 1. 三级标题 │ ← 仿宋 三号 加粗
|
||||
│ (1) 四级标题 │ ← 仿宋 三号
|
||||
│ │
|
||||
│ 附件: 1. xxx │ ← 仿宋 三号
|
||||
│ │
|
||||
│ 发文机关署名 │ ← 仿宋 三号
|
||||
│ 成文日期 │ ← 仿宋 三号, 阿拉伯数字 (GB/T 9704-2012; e.g. 2025年3月1日)
|
||||
├─────────────────────────────────┤
|
||||
│ ══════════════════ (版记线) │
|
||||
│ 抄送: xxx │ ← 仿宋 四号
|
||||
│ 印发机关及日期 │ ← 仿宋 四号
|
||||
└─────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Numbering System
|
||||
|
||||
```
|
||||
一、 ← 黑体 (SimHei), no indentation
|
||||
(一) ← 楷体 (KaiTi), indented 2 chars
|
||||
1. ← 仿宋加粗 (FangSong Bold), indented 2 chars
|
||||
(1) ← 仿宋 (FangSong), indented 2 chars
|
||||
```
|
||||
|
||||
### Colors
|
||||
|
||||
| Element | Color | Requirement |
|
||||
|---------|-------|-------------|
|
||||
| All body text | Black #000000 | Mandatory |
|
||||
| 红头 (agency name) | Red #FF0000 | Mandatory |
|
||||
| 红线 (separator) | Red #FF0000 | Mandatory |
|
||||
| 公章 (official seal) | Red | Mandatory |
|
||||
|
||||
### Page Numbers
|
||||
|
||||
- Position: bottom center
|
||||
- Format: `-X-` (dash-number-dash)
|
||||
- Font: 宋体 四号 (SimSun 14pt, `sz="28"`)
|
||||
- No page number on cover page if present
|
||||
|
||||
---
|
||||
|
||||
## Mixed Script
|
||||
|
||||
### Font Size Harmony
|
||||
|
||||
CJK characters appear larger than Latin characters at the same point size. Compensation:
|
||||
|
||||
- If body is Calibri 11pt, pair with CJK at 11pt (same size — CJK looks slightly larger but acceptable)
|
||||
- If precise visual match needed, CJK can be set 0.5–1pt smaller
|
||||
- In practice, same point size is standard — don't over-optimize
|
||||
|
||||
### Bold and Italic
|
||||
|
||||
- **Chinese/Japanese have no true italic.** Word synthesizes a slant which looks poor
|
||||
- Use **bold** for emphasis in CJK text
|
||||
- Use 着重号 (emphasis dots) for traditional emphasis: `<w:em w:val="dot"/>` on RunProperties
|
||||
|
||||
---
|
||||
|
||||
## OpenXML Quick Reference
|
||||
|
||||
### Set EastAsia Font (C#)
|
||||
|
||||
```csharp
|
||||
new Run(
|
||||
new RunProperties(
|
||||
new RunFonts { EastAsia = "SimSun", Ascii = "Calibri", HighAnsi = "Calibri" },
|
||||
new FontSize { Val = "32" } // 三号 = 16pt = sz 32
|
||||
),
|
||||
new Text("这是正文内容")
|
||||
);
|
||||
```
|
||||
|
||||
### Document Defaults (C#)
|
||||
|
||||
```csharp
|
||||
new DocDefaults(new RunPropertiesDefault(new RunPropertiesBaseStyle(
|
||||
new RunFonts {
|
||||
Ascii = "Calibri", HighAnsi = "Calibri",
|
||||
EastAsia = "Microsoft YaHei"
|
||||
},
|
||||
new Languages { Val = "en-US", EastAsia = "zh-CN" }
|
||||
)));
|
||||
```
|
||||
|
||||
### 公文 Style Definitions (C#)
|
||||
|
||||
```csharp
|
||||
// Title style — 小标宋 二号 centered
|
||||
new Style(
|
||||
new StyleName { Val = "GongWen Title" },
|
||||
new BasedOn { Val = "Normal" },
|
||||
new StyleRunProperties(
|
||||
new RunFonts { EastAsia = "FZXiaoBiaoSong-B05S" },
|
||||
new FontSize { Val = "44" }, // 二号 = 22pt
|
||||
new Bold()
|
||||
),
|
||||
new StyleParagraphProperties(
|
||||
new Justification { Val = JustificationValues.Center },
|
||||
new SpacingBetweenLines { Line = "560", LineRule = LineSpacingRuleValues.Exact }
|
||||
)
|
||||
) { Type = StyleValues.Paragraph, StyleId = "GongWenTitle" };
|
||||
|
||||
// Body style — 仿宋_GB2312 三号
|
||||
new Style(
|
||||
new StyleName { Val = "GongWen Body" },
|
||||
new StyleRunProperties(
|
||||
new RunFonts { EastAsia = "FangSong_GB2312", Ascii = "FangSong_GB2312" },
|
||||
new FontSize { Val = "32" } // 三号 = 16pt
|
||||
),
|
||||
new StyleParagraphProperties(
|
||||
new SpacingBetweenLines { Line = "560", LineRule = LineSpacingRuleValues.Exact }
|
||||
)
|
||||
) { Type = StyleValues.Paragraph, StyleId = "GongWenBody" };
|
||||
```
|
||||
|
||||
### Emphasis Dots (着重号)
|
||||
|
||||
```csharp
|
||||
new RunProperties(new Emphasis { Val = EmphasisMarkValues.Dot });
|
||||
```
|
||||
|
||||
### East Asian Text Layout
|
||||
|
||||
```xml
|
||||
<!-- Snap to grid (align CJK chars to character grid) -->
|
||||
<w:snapToGrid w:val="true"/>
|
||||
|
||||
<!-- Two-lines-in-one (双行合一) -->
|
||||
<w:eastAsianLayout w:id="1" w:combine="true"/>
|
||||
|
||||
<!-- Vertical text in a cell -->
|
||||
<w:textDirection w:val="tbRl"/>
|
||||
```
|
||||
+184
@@ -0,0 +1,184 @@
|
||||
# Chinese University Thesis Template Guide (中国高校论文模板指南)
|
||||
|
||||
## Why This Guide Exists
|
||||
|
||||
Chinese university thesis templates (.docx) have structural patterns that differ significantly
|
||||
from Western templates. Agents that assume Western conventions (Heading1/Heading2/Normal) will
|
||||
fail repeatedly. This guide documents the ACTUAL patterns found in Chinese templates.
|
||||
|
||||
## Common StyleId Patterns
|
||||
|
||||
### Pattern A: Numeric IDs (most common in Chinese Word templates)
|
||||
|
||||
| Style Purpose | styleId | w:name | w:basedOn |
|
||||
|--------------|---------|--------|-----------|
|
||||
| Normal body | `a` | "Normal" | — |
|
||||
| Default paragraph font | `a0` | "Default Paragraph Font" | — |
|
||||
| Heading 1 (章标题) | `1` | "heading 1" | `a` |
|
||||
| Heading 2 (节标题) | `2` | "heading 2" | `a` |
|
||||
| Heading 3 (小节标题) | `3` | "heading 3" | `a` |
|
||||
| TOC 1 | `11` | "toc 1" | `a` |
|
||||
| TOC 2 | `21` | "toc 2" | `a` |
|
||||
| TOC 3 | `31` | "toc 3" | `a` |
|
||||
| Header | `a3` | "header" | `a` |
|
||||
| Footer | `a4` | "footer" | `a` |
|
||||
| Table of Contents heading | `10` | "TOC Heading" | `1` |
|
||||
|
||||
### Pattern B: English IDs (less common, usually from international templates)
|
||||
Standard Heading1/Heading2/Heading3/Normal — these follow the Western pattern.
|
||||
|
||||
### Pattern C: Mixed (some Chinese, some English)
|
||||
Some templates define custom styles with Chinese names:
|
||||
| Style Purpose | styleId | w:name |
|
||||
|--------------|---------|--------|
|
||||
| 论文标题 | `lunwenbiaoti` | "论文标题" |
|
||||
| 章标题 | `zhangbiaoti` | "章标题" |
|
||||
| 正文 | `zhengwen` | "正文" |
|
||||
|
||||
### How to Identify Which Pattern
|
||||
|
||||
```bash
|
||||
# Extract all styleIds from the template
|
||||
$CLI analyze --input template.docx --styles-only
|
||||
|
||||
# Or manually:
|
||||
# unzip template.docx word/styles.xml
|
||||
# Search for w:styleId= in the extracted file
|
||||
```
|
||||
|
||||
Look at the first few styleIds. If you see `1`, `2`, `3`, `a`, `a0` → Pattern A.
|
||||
If you see `Heading1`, `Normal` → Pattern B.
|
||||
|
||||
## Standard Thesis Structure
|
||||
|
||||
Chinese university theses follow a highly standardized structure:
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────┐
|
||||
│ 封面 (Cover Page) │ ← Usually 1-2 pages
|
||||
│ - 校名、校徽 │
|
||||
│ - 论文题目 (title) │
|
||||
│ - 作者、导师、院系、日期 │
|
||||
├─────────────────────────────────────┤
|
||||
│ 学术诚信承诺书 / 独创性声明 │ ← 1 page
|
||||
│ (Academic Integrity Declaration) │
|
||||
├─────────────────────────────────────┤
|
||||
│ 中文摘要 (Chinese Abstract) │ ← 1-2 pages
|
||||
│ - "摘 要" heading │
|
||||
│ - Abstract body │
|
||||
│ - "关键词:" line │
|
||||
├─────────────────────────────────────┤
|
||||
│ 英文摘要 (English Abstract) │ ← 1-2 pages
|
||||
│ - "ABSTRACT" heading │
|
||||
│ - Abstract body │
|
||||
│ - "Keywords:" line │
|
||||
├─────────────────────────────────────┤
|
||||
│ 目录 (Table of Contents) │ ← 1-3 pages
|
||||
│ - Often inside SDT block │
|
||||
│ - Static example entries │
|
||||
│ - TOC field code │
|
||||
├─────────────────────────────────────┤
|
||||
│ 正文 (Body) │ ← Main content
|
||||
│ 第1章 绪论 │
|
||||
│ 1.1 研究背景 │
|
||||
│ 1.2 研究目的和意义 │
|
||||
│ 第2章 文献综述 │
|
||||
│ ... │
|
||||
│ 第N章 结论与展望 │
|
||||
├─────────────────────────────────────┤
|
||||
│ 参考文献 (References) │ ← Styled differently
|
||||
├─────────────────────────────────────┤
|
||||
│ 致谢 (Acknowledgments) │ ← Optional
|
||||
├─────────────────────────────────────┤
|
||||
│ 附录 (Appendices) │ ← Optional
|
||||
└─────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Identifying Zone Boundaries in Templates
|
||||
|
||||
Templates contain EXAMPLE content that must be replaced. Here's how to find the zones:
|
||||
|
||||
### Zone A (Front matter) — KEEP from template
|
||||
- Starts at: paragraph 0
|
||||
- Ends at: the paragraph BEFORE the first chapter heading
|
||||
- Contains: cover, declaration, abstracts, TOC
|
||||
- How to detect end: search for first paragraph with style `1` (or Heading1) containing "第1章" or "绪论"
|
||||
|
||||
### Zone B (Body content) — REPLACE with user content
|
||||
- Starts at: first chapter heading ("第1章...")
|
||||
- Ends at: "参考文献" heading (inclusive) or last body paragraph before acknowledgments
|
||||
- How to detect:
|
||||
```python
|
||||
for i, el in enumerate(body_elements):
|
||||
text = get_text(el)
|
||||
style = get_style(el)
|
||||
if style in ('1', 'Heading1') and ('第1章' in text or '绪论' in text):
|
||||
zone_b_start = i
|
||||
if '参考文献' in text:
|
||||
zone_b_end = i
|
||||
```
|
||||
|
||||
### Zone C (Back matter) — KEEP from template (or remove)
|
||||
- Starts after: 参考文献
|
||||
- Contains: 致谢, 附录, final sectPr
|
||||
|
||||
## Font Expectations in Chinese Thesis Templates
|
||||
|
||||
| Element | Font | Size (字号) | Size (pt) | w:sz |
|
||||
|---------|------|------------|-----------|------|
|
||||
| 论文标题 | 华文中宋 or 黑体 | 二号 or 小二 | 22pt or 18pt | 44 or 36 |
|
||||
| 章标题 (H1) | 黑体 | 三号 | 16pt | 32 |
|
||||
| 节标题 (H2) | 黑体 | 四号 | 14pt | 28 |
|
||||
| 小节标题 (H3) | 黑体 | 小四 | 12pt | 24 |
|
||||
| 正文 | 宋体 | 小四 | 12pt | 24 |
|
||||
| 页眉 | 宋体 | 五号 | 10.5pt | 21 |
|
||||
| 页脚/页码 | 宋体 | 五号 | 10.5pt | 21 |
|
||||
| 表格内容 | 宋体 | 五号 | 10.5pt | 21 |
|
||||
| 参考文献条目 | 宋体 | 五号 | 10.5pt | 21 |
|
||||
|
||||
## RunFonts for CJK Body Text
|
||||
|
||||
```xml
|
||||
<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman"
|
||||
w:eastAsia="宋体" w:cs="Times New Roman"/>
|
||||
```
|
||||
|
||||
For headings:
|
||||
```xml
|
||||
<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman"
|
||||
w:eastAsia="黑体" w:cs="Times New Roman"/>
|
||||
```
|
||||
|
||||
IMPORTANT: When cleaning direct formatting, ALWAYS preserve w:eastAsia.
|
||||
Removing it causes Chinese text to fall back to the wrong font.
|
||||
|
||||
## Common Mistakes with Chinese Templates
|
||||
|
||||
1. **Searching for `Heading1`** — Chinese templates use `1`, not `Heading1`
|
||||
2. **Clearing all rFonts** — Must keep eastAsia font declarations
|
||||
3. **Assuming "第1章" is the first paragraph** — It's typically paragraph 100+ after cover/abstract/TOC
|
||||
4. **Ignoring SDT blocks in TOC** — The TOC is wrapped in an SDT, not just field codes
|
||||
5. **Wrong line spacing** — Chinese theses typically use fixed 20pt (line="400") or 22pt (line="440"), not the 28pt used in government documents
|
||||
6. **Missing section breaks** — Each zone (abstract, TOC, body) usually has its own sectPr for different headers/footers
|
||||
|
||||
## Style Mapping Quick Reference
|
||||
|
||||
When source document uses Western IDs and template uses Chinese numeric IDs:
|
||||
|
||||
```json
|
||||
{
|
||||
"Heading1": "1",
|
||||
"Heading2": "2",
|
||||
"Heading3": "3",
|
||||
"Heading4": "3",
|
||||
"Normal": "a",
|
||||
"BodyText": "a",
|
||||
"ListParagraph": "a",
|
||||
"Caption": "a",
|
||||
"TOC1": "11",
|
||||
"TOC2": "21",
|
||||
"TOC3": "31"
|
||||
}
|
||||
```
|
||||
|
||||
When source uses Chinese numeric IDs and template uses Western IDs — reverse the mapping.
|
||||
@@ -0,0 +1,191 @@
|
||||
# Comments System Guide (4-File Architecture)
|
||||
|
||||
## Overview
|
||||
|
||||
Word comments require coordination across **four XML files** plus references in `document.xml`, `[Content_Types].xml`, and `document.xml.rels`.
|
||||
|
||||
---
|
||||
|
||||
## The Four Comment Files
|
||||
|
||||
### 1. `word/comments.xml` — Main Comment Content
|
||||
|
||||
Contains the actual comment text:
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:comments xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml">
|
||||
<w:comment w:id="1" w:author="Alice" w:date="2026-03-21T09:00:00Z" w:initials="A">
|
||||
<w:p w14:paraId="1A2B3C4D">
|
||||
<w:pPr><w:pStyle w:val="CommentText" /></w:pPr>
|
||||
<w:r>
|
||||
<w:rPr><w:rStyle w:val="CommentReference" /></w:rPr>
|
||||
<w:annotationRef />
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:t>This needs clarification.</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
</w:comment>
|
||||
</w:comments>
|
||||
```
|
||||
|
||||
Key attributes: `w:id` (unique integer), `w:author`, `w:date` (ISO 8601), `w:initials`.
|
||||
|
||||
### 2. `word/commentsExtended.xml` — W15 Extensions
|
||||
|
||||
Links comments to paragraphs and tracks resolved status:
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w15:commentsEx xmlns:w15="http://schemas.microsoft.com/office/word/2012/wordml">
|
||||
<w15:commentEx w15:paraId="1A2B3C4D" w15:done="0" />
|
||||
</w15:commentsEx>
|
||||
```
|
||||
|
||||
- `w15:paraId` — matches the `w14:paraId` of the comment's last (usually only) paragraph in `comments.xml`
|
||||
- `w15:done` — `"0"` = open, `"1"` = resolved
|
||||
|
||||
### 3. `word/commentsIds.xml` — Persistent ID Mapping
|
||||
|
||||
Provides durable IDs that survive copy/paste across documents:
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w16cid:commentsIds xmlns:w16cid="http://schemas.microsoft.com/office/word/2016/wordml/cid">
|
||||
<w16cid:commentId w16cid:paraId="1A2B3C4D" w16cid:durableId="12345678" />
|
||||
</w16cid:commentsIds>
|
||||
```
|
||||
|
||||
- `w16cid:paraId` — same as `w15:paraId`
|
||||
- `w16cid:durableId` — globally unique identifier (8-digit hex)
|
||||
|
||||
### 4. `word/commentsExtensible.xml` — W16 Extensions
|
||||
|
||||
Modern comment extensions (used in newer Word versions):
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w16cex:commentsExtensible xmlns:w16cex="http://schemas.microsoft.com/office/word/2018/wordml/cex">
|
||||
<w16cex:commentExtensible w16cex:durableId="12345678" w16cex:dateUtc="2026-03-21T09:00:00Z" />
|
||||
</w16cex:commentsExtensible>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Document.xml References
|
||||
|
||||
Comments are anchored in document content using three elements:
|
||||
|
||||
```xml
|
||||
<w:p>
|
||||
<w:commentRangeStart w:id="1" />
|
||||
<w:r><w:t>This text has a comment.</w:t></w:r>
|
||||
<w:commentRangeEnd w:id="1" />
|
||||
<w:r>
|
||||
<w:rPr><w:rStyle w:val="CommentReference" /></w:rPr>
|
||||
<w:commentReference w:id="1" />
|
||||
</w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
- `w:commentRangeStart` — marks where the commented text begins
|
||||
- `w:commentRangeEnd` — marks where the commented text ends
|
||||
- `w:commentReference` — the visible comment marker (superscript number), placed in a run after the range end
|
||||
|
||||
The `w:id` on all three must match the `w:id` in `comments.xml`.
|
||||
|
||||
---
|
||||
|
||||
## Content Types Registration
|
||||
|
||||
Add to `[Content_Types].xml`:
|
||||
|
||||
```xml
|
||||
<Override PartName="/word/comments.xml"
|
||||
ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml" />
|
||||
<Override PartName="/word/commentsExtended.xml"
|
||||
ContentType="application/vnd.ms-word.commentsExtended+xml" />
|
||||
<Override PartName="/word/commentsIds.xml"
|
||||
ContentType="application/vnd.ms-word.commentsIds+xml" />
|
||||
<Override PartName="/word/commentsExtensible.xml"
|
||||
ContentType="application/vnd.ms-word.commentsExtensible+xml" />
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Relationship Registration
|
||||
|
||||
Add to `word/_rels/document.xml.rels`:
|
||||
|
||||
```xml
|
||||
<Relationship Id="rId20" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"
|
||||
Target="comments.xml" />
|
||||
<Relationship Id="rId21" Type="http://schemas.microsoft.com/office/2011/relationships/commentsExtended"
|
||||
Target="commentsExtended.xml" />
|
||||
<Relationship Id="rId22" Type="http://schemas.microsoft.com/office/2016/09/relationships/commentsIds"
|
||||
Target="commentsIds.xml" />
|
||||
<Relationship Id="rId23" Type="http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible"
|
||||
Target="commentsExtensible.xml" />
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step: Adding a New Comment
|
||||
|
||||
1. **Choose a unique comment ID** (scan existing `w:id` values, use max + 1)
|
||||
2. **Generate a paraId** (8-character hex, e.g., `"1A2B3C4D"`; the value must be greater than 0 and less than `0x80000000`) and a durableId (8-digit hex)
|
||||
3. **Add to `comments.xml`**: Create `w:comment` element with content
|
||||
4. **Add to `commentsExtended.xml`**: Create `w15:commentEx` with `paraId`, `done="0"`
|
||||
5. **Add to `commentsIds.xml`**: Create `w16cid:commentId` with `paraId` and `durableId`
|
||||
6. **Add to `commentsExtensible.xml`**: Create `w16cex:commentExtensible` with `durableId` and `dateUtc`
|
||||
7. **Add to `document.xml`**: Insert `w:commentRangeStart`, `w:commentRangeEnd`, and `w:commentReference` around target text
|
||||
8. **Verify `[Content_Types].xml`** and `document.xml.rels` have entries for all 4 files
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step: Adding a Reply
|
||||
|
||||
Replies are ordinary comments that are linked to a parent comment through `w15:paraIdParent` in `commentsExtended.xml`:
|
||||
|
||||
1. Create a new `w:comment` in `comments.xml` with a new `w:id`
|
||||
2. In `commentsExtended.xml`, add `w15:commentEx` with:
|
||||
- `w15:paraId` = new paragraph ID
|
||||
- `w15:paraIdParent` = the `paraId` of the comment being replied to
|
||||
- `w15:done="0"`
|
||||
3. Add entries in `commentsIds.xml` and `commentsExtensible.xml`
|
||||
4. In `document.xml`, the reply does NOT need its own range markers — it shares the parent's range
|
||||
|
||||
```xml
|
||||
<!-- In commentsExtended.xml -->
|
||||
<w15:commentEx w15:paraId="5E6F7A8B" w15:paraIdParent="1A2B3C4D" w15:done="0" />
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step: Resolving a Comment
|
||||
|
||||
Set `w15:done="1"` on the comment's `w15:commentEx` entry:
|
||||
|
||||
```xml
|
||||
<!-- Before -->
|
||||
<w15:commentEx w15:paraId="1A2B3C4D" w15:done="0" />
|
||||
|
||||
<!-- After -->
|
||||
<w15:commentEx w15:paraId="1A2B3C4D" w15:done="1" />
|
||||
```
|
||||
|
||||
This marks the comment (and all its replies) as resolved. The comment remains visible but appears grayed out in Word.
|
||||
|
||||
---
|
||||
|
||||
## Minimum Viable Comment
|
||||
|
||||
At minimum, a working comment requires:
|
||||
1. `comments.xml` with the `w:comment` element
|
||||
2. `document.xml` with range markers and reference
|
||||
3. Relationship in `document.xml.rels`
|
||||
4. Content type in `[Content_Types].xml`
|
||||
|
||||
The extended files (`commentsExtended`, `commentsIds`, `commentsExtensible`) are optional but recommended for full compatibility with modern Word.
|
||||
@@ -0,0 +1,829 @@
|
||||
# GOOD vs BAD Document Design — Concrete OpenXML Examples
|
||||
|
||||
A side-by-side reference showing common design mistakes and their fixes, with exact OpenXML parameter values. Use this to develop an intuitive sense of what makes a document look professional versus amateur.
|
||||
|
||||
Format: Each comparison shows the **BAD** version first (the mistake), then the **GOOD** version (the fix), with OpenXML markup and a short explanation.
|
||||
|
||||
---
|
||||
|
||||
## 1. Font Size Disasters
|
||||
|
||||
### 1a. No Hierarchy — Everything the Same Size
|
||||
|
||||
**BAD: Body=12pt, H1=12pt bold**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ INTRODUCTION │ ← 12pt bold... same visual weight
|
||||
│ This is the body text of the │ ← 12pt regular
|
||||
│ report. It discusses findings │
|
||||
│ from the quarterly review. │
|
||||
│ METHODOLOGY │ ← Where does the section start?
|
||||
│ We collected data from three │
|
||||
│ sources across the enterprise. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<!-- H1: bold but same size as body — no visual separation -->
|
||||
<w:rPr><w:b/><w:sz w:val="24"/></w:rPr>
|
||||
<!-- Body -->
|
||||
<w:rPr><w:sz w:val="24"/></w:rPr>
|
||||
```
|
||||
|
||||
**GOOD: Modular scale — body=11pt, H3=13pt, H2=16pt, H1=20pt**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ │
|
||||
│ Introduction │ ← 20pt, clearly a title
|
||||
│ │
|
||||
│ This is the body text of the │ ← 11pt, comfortable reading size
|
||||
│ report. It discusses findings │
|
||||
│ from the quarterly review. │
|
||||
│ │
|
||||
│ Methodology │ ← 20pt, section break is obvious
|
||||
│ │
|
||||
│ We collected data from three │
|
||||
│ sources across the enterprise. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<!-- H1: 20pt = w:sz 40 -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri Light"/><w:sz w:val="40"/></w:rPr>
|
||||
<!-- H2: 16pt = w:sz 32 -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri Light"/><w:sz w:val="32"/></w:rPr>
|
||||
<!-- H3: 13pt = w:sz 26, bold -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri"/><w:b/><w:sz w:val="26"/></w:rPr>
|
||||
<!-- Body: 11pt = w:sz 22 -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri"/><w:sz w:val="22"/></w:rPr>
|
||||
```
|
||||
**Why better:** A clear size progression (ratio ~1.25x per step) lets readers instantly identify structure without reading a word.
|
||||
|
||||
---
|
||||
|
||||
### 1b. Too Much Contrast — Children's Book Look
|
||||
|
||||
**BAD: H1=28pt with body=10pt (ratio 2.8x)**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ │
|
||||
│ QUARTERLY REPORT │ ← 28pt, dominates the page
|
||||
│ │
|
||||
│ This is body text set very small │ ← 10pt, straining to read
|
||||
│ and the contrast with the title │
|
||||
│ makes it feel like a poster. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:rPr><w:b/><w:sz w:val="56"/></w:rPr> <!-- 28pt heading -->
|
||||
<w:rPr><w:sz w:val="20"/></w:rPr> <!-- 10pt body -->
|
||||
```
|
||||
|
||||
**GOOD: H1=20pt with body=11pt (ratio ~1.8x)**
|
||||
```xml
|
||||
<w:rPr><w:sz w:val="40"/></w:rPr> <!-- 20pt heading -->
|
||||
<w:rPr><w:sz w:val="22"/></w:rPr> <!-- 11pt body -->
|
||||
```
|
||||
**Why better:** A heading-to-body ratio between 1.5x and 2.0x reads as "structured" rather than "shouting."
|
||||
|
||||
---
|
||||
|
||||
## 2. Spacing Crimes
|
||||
|
||||
### 2a. Wall of Text — No Paragraph or Line Spacing
|
||||
|
||||
**BAD: Single line spacing, 0pt between paragraphs**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│The findings indicate a strong │
|
||||
│correlation between training hours│
|
||||
│and performance metrics. │
|
||||
│Further analysis revealed that │ ← No gap — where does the new
|
||||
│departments with higher budgets │ paragraph start?
|
||||
│achieved better outcomes in all │
|
||||
│measured categories. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:spacing w:line="240" w:lineRule="auto" w:after="0"/> <!-- 1.0 spacing (240/240), no paragraph gap -->
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
**GOOD: 1.15x line spacing, 8pt after each paragraph**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│The findings indicate a strong │
|
||||
│correlation between training │ ← Slightly more air between lines
|
||||
│hours and performance metrics. │
|
||||
│ │ ← 8pt gap signals new paragraph
|
||||
│Further analysis revealed that │
|
||||
│departments with higher budgets │
|
||||
│achieved better outcomes in all │
|
||||
│measured categories. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:spacing w:line="276" w:lineRule="auto" w:after="160"/> <!-- 1.15x (276/240), 8pt = 160 twips after -->
|
||||
</w:pPr>
|
||||
```
|
||||
**Why better:** Line spacing gives each line room to breathe; paragraph spacing separates ideas without wasting a full blank line.
|
||||
|
||||
---
|
||||
|
||||
### 2b. Floating Headings — Same Space Above and Below
|
||||
|
||||
**BAD: 12pt before and 12pt after heading**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ ...end of previous section. │
|
||||
│ │ ← 12pt gap
|
||||
│ Section Two │ ← Heading floats in the middle
|
||||
│ │ ← 12pt gap
|
||||
│ Start of section two content. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:spacing w:before="240" w:after="240"/> <!-- 12pt both sides -->
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
**GOOD: 24pt before, 8pt after heading**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ ...end of previous section. │
|
||||
│ │
|
||||
│ │ ← 24pt gap — clear section break
|
||||
│ Section Two │ ← Heading is close to its content
|
||||
│ │ ← 8pt gap
|
||||
│ Start of section two content. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:spacing w:before="480" w:after="160"/> <!-- 24pt before, 8pt after -->
|
||||
</w:pPr>
|
||||
```
|
||||
**Why better:** Proximity principle: a heading belongs to the text that follows it, so more space above and less space below anchors it to its content.
|
||||
|
||||
---
|
||||
|
||||
### 2c. Wasteful Gaps — Huge Spacing Everywhere
|
||||
|
||||
**BAD: 24pt after every paragraph, including body text**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ First paragraph of text here. │
|
||||
│ │
|
||||
│ │ ← 24pt gap after every paragraph
|
||||
│ │
|
||||
│ Second paragraph of text here. │
|
||||
│ │
|
||||
│ │
|
||||
│ │
|
||||
│ Third paragraph. │ ← Document looks mostly white space
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:spacing w:after="480"/> <!-- 24pt = 480 twips after every paragraph -->
|
||||
```
|
||||
|
||||
**GOOD: Proportional spacing — body=8pt, H2=6pt after, H1=10pt after**
|
||||
```xml
|
||||
<!-- Body paragraph -->
|
||||
<w:spacing w:after="160"/> <!-- 8pt after body -->
|
||||
<!-- H1 -->
|
||||
<w:spacing w:before="480" w:after="200"/> <!-- 24pt before, 10pt after -->
|
||||
<!-- H2 -->
|
||||
<w:spacing w:before="320" w:after="120"/> <!-- 16pt before, 6pt after -->
|
||||
```
|
||||
**Why better:** Spacing should vary by element role, creating a visual rhythm rather than uniform gaps.
|
||||
|
||||
---
|
||||
|
||||
## 3. Margin Mistakes
|
||||
|
||||
### 3a. Cramped Margins — Text Running to the Edge
|
||||
|
||||
**BAD: 0.5in margins all around**
|
||||
```
|
||||
┌────────────────────────────────────────────────┐
|
||||
│Text starts almost at the paper edge and runs │
|
||||
│all the way across making extremely long lines │
|
||||
│that are hard to track from end back to start. │
|
||||
│The eye loses its place on every line return. │
|
||||
└────────────────────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720"/>
|
||||
<!-- 720 twips = 0.5in — line length ~7.5in on letter paper -->
|
||||
```
|
||||
|
||||
**GOOD: 1in margins (standard)**
|
||||
```xml
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"/>
|
||||
<!-- 1440 twips = 1.0in — line length ~6.5in, ideal for 11pt body -->
|
||||
```
|
||||
**Why better:** Optimal line length is 60-75 characters. At 11pt Calibri, 6.5in width achieves roughly 70 characters per line.
|
||||
|
||||
---
|
||||
|
||||
### 3b. Over-Padded Margins — Looks Like the Content is Hiding
|
||||
|
||||
**BAD: 2in margins on a short document**
|
||||
```xml
|
||||
<w:pgMar w:top="2880" w:right="2880" w:bottom="2880" w:left="2880"/>
|
||||
<!-- 2880 twips = 2.0in — only 4.5in of text width, looks padded -->
|
||||
```
|
||||
|
||||
**GOOD: 1in standard, or 1.25in for formal documents**
|
||||
```xml
|
||||
<!-- Standard -->
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"/>
|
||||
<!-- Formal / bound documents: wider left margin for binding (no separate gutter) -->
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1800" w:gutter="0"/>
|
||||
<!-- 1800 twips = 1.25in left for binding margin -->
|
||||
```
|
||||
**Why better:** Margins should frame the content, not overwhelm it. 1-1.25in works for virtually all business and academic documents.
|
||||
|
||||
---
|
||||
|
||||
## 4. Table Ugliness
|
||||
|
||||
### 4a. Prison Grid — Full Borders on Every Cell
|
||||
|
||||
**BAD: Every cell with 0.5pt borders on all four sides**
|
||||
```
|
||||
┌───────┬───────┬───────┬───────┐
|
||||
│ Name │ Dept │ Score │ Grade │
|
||||
├───────┼───────┼───────┼───────┤
|
||||
│ Alice │ Eng │ 92 │ A │
|
||||
├───────┼───────┼───────┼───────┤
|
||||
│ Bob │ Sales │ 85 │ B │
|
||||
├───────┼───────┼───────┼───────┤
|
||||
│ Carol │ Eng │ 78 │ C+ │
|
||||
└───────┴───────┴───────┴───────┘
|
||||
```
|
||||
```xml
|
||||
<w:tcBorders>
|
||||
<w:top w:val="single" w:sz="4" w:color="000000"/>
|
||||
<w:left w:val="single" w:sz="4" w:color="000000"/>
|
||||
<w:bottom w:val="single" w:sz="4" w:color="000000"/>
|
||||
<w:right w:val="single" w:sz="4" w:color="000000"/>
|
||||
</w:tcBorders>
|
||||
```
|
||||
|
||||
**GOOD: Three-line table (三线表) — top thick, header-bottom medium, table-bottom thick**
|
||||
```
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ← 1.5pt top border
|
||||
Name Dept Score Grade
|
||||
────────────────────────────────── ← 0.75pt header separator
|
||||
Alice Eng 92 A
|
||||
Bob Sales 85 B
|
||||
Carol Eng 78 C+
|
||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ← 1.5pt bottom border
|
||||
```
|
||||
```xml
|
||||
<!-- Header row cells: thick top border, no side borders, thin bottom separator -->
|
||||
<w:top w:val="single" w:sz="12" w:color="000000"/> <!-- 1.5pt -->
|
||||
<w:left w:val="nil"/><w:right w:val="nil"/>
|
||||
<w:bottom w:val="single" w:sz="6" w:color="000000"/> <!-- 0.75pt -->
|
||||
|
||||
<!-- Data row cells: no borders on any side -->
|
||||
<w:top w:val="nil"/><w:left w:val="nil"/><w:right w:val="nil"/>
|
||||
<w:bottom w:val="nil"/>
|
||||
|
||||
<!-- Last row bottom border -->
|
||||
<w:bottom w:val="single" w:sz="12" w:color="000000"/> <!-- 1.5pt -->
|
||||
```
|
||||
**Why better:** Removing inner borders lets the eye scan data freely. Three lines provide structure without visual clutter.
|
||||
|
||||
---
|
||||
|
||||
### 4b. Text Touching Borders — No Cell Padding
|
||||
|
||||
**BAD: Zero cell margins**
|
||||
```
|
||||
┌──────────┬──────────┐
|
||||
│Name │Department│ ← Text cramped against borders
|
||||
├──────────┼──────────┤
|
||||
│Alice │Engineering│
|
||||
└──────────┴──────────┘
|
||||
```
|
||||
```xml
|
||||
<w:tcMar>
|
||||
<w:top w:w="0" w:type="dxa"/>
|
||||
<w:start w:w="0" w:type="dxa"/>
|
||||
<w:bottom w:w="0" w:type="dxa"/>
|
||||
<w:end w:w="0" w:type="dxa"/>
|
||||
</w:tcMar>
|
||||
```
|
||||
|
||||
**GOOD: 0.08in vertical, 0.12in horizontal padding**
|
||||
```xml
|
||||
<w:tcMar>
|
||||
<w:top w:w="115" w:type="dxa"/> <!-- ~0.08in = 115 twips -->
|
||||
<w:start w:w="173" w:type="dxa"/> <!-- ~0.12in = 173 twips -->
|
||||
<w:bottom w:w="115" w:type="dxa"/>
|
||||
<w:end w:w="173" w:type="dxa"/>
|
||||
</w:tcMar>
|
||||
```
|
||||
**Why better:** Padding gives text breathing room inside cells, making every value easier to read.
|
||||
|
||||
---
|
||||
|
||||
### 4c. Invisible Headers — Header Row Same Style as Data
|
||||
|
||||
**BAD: Header row indistinguishable from data**
|
||||
```xml
|
||||
<!-- Header cell run properties — identical to data -->
|
||||
<w:rPr><w:sz w:val="22"/></w:rPr>
|
||||
```
|
||||
|
||||
**GOOD: Bold header text, subtle background fill, bottom border**
|
||||
```xml
|
||||
<!-- Header cell run properties -->
|
||||
<w:rPr><w:b/><w:sz w:val="22"/><w:color w:val="333333"/></w:rPr>
|
||||
|
||||
<!-- Header cell shading -->
|
||||
<w:tcPr>
|
||||
<w:shd w:val="clear" w:color="auto" w:fill="F2F2F2"/> <!-- light gray bg -->
|
||||
<w:tcBorders>
|
||||
<w:bottom w:val="single" w:sz="8" w:color="666666"/> <!-- 1pt separator -->
|
||||
</w:tcBorders>
|
||||
</w:tcPr>
|
||||
|
||||
<!-- Mark row as header (repeats on page break) -->
|
||||
<w:trPr><w:tblHeader/></w:trPr>
|
||||
```
|
||||
**Why better:** Distinct header styling lets readers instantly locate column meanings, especially in long tables that span pages. The `w:tblHeader` element ensures the header row repeats on every page.
|
||||
|
||||
---
|
||||
|
||||
## 5. Font Pairing Failures
|
||||
|
||||
### 5a. Visual Chaos — Too Many Fonts
|
||||
|
||||
**BAD: 4+ fonts in one document**
|
||||
```xml
|
||||
<!-- H1 in Impact -->
|
||||
<w:rPr><w:rFonts w:ascii="Impact"/><w:sz w:val="40"/></w:rPr>
|
||||
<!-- H2 in Georgia -->
|
||||
<w:rPr><w:rFonts w:ascii="Georgia"/><w:sz w:val="32"/></w:rPr>
|
||||
<!-- Body in Verdana -->
|
||||
<w:rPr><w:rFonts w:ascii="Verdana"/><w:sz w:val="22"/></w:rPr>
|
||||
<!-- Captions in Courier New -->
|
||||
<w:rPr><w:rFonts w:ascii="Courier New"/><w:sz w:val="18"/></w:rPr>
|
||||
```
|
||||
|
||||
**GOOD: One font family with weight variation, or two complementary families**
|
||||
```xml
|
||||
<!-- H1: Calibri Light (thin weight of Calibri family) -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri Light"/><w:sz w:val="40"/></w:rPr>
|
||||
<!-- H2: Calibri Light -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri Light"/><w:sz w:val="32"/></w:rPr>
|
||||
<!-- Body: Calibri (regular weight) -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri"/><w:sz w:val="22"/></w:rPr>
|
||||
<!-- Captions: Calibri -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri"/><w:sz w:val="18"/></w:rPr>
|
||||
```
|
||||
**Why better:** Limiting to one or two font families creates visual coherence. Vary by size and weight, not by font.
|
||||
|
||||
---
|
||||
|
||||
### 5b. Mismatched Personality — Comic Sans Meets Times New Roman
|
||||
|
||||
**BAD:**
|
||||
```xml
|
||||
<w:rPr><w:rFonts w:ascii="Comic Sans MS"/><w:sz w:val="36"/></w:rPr> <!-- heading -->
|
||||
<w:rPr><w:rFonts w:ascii="Times New Roman"/><w:sz w:val="24"/></w:rPr> <!-- body -->
|
||||
```
|
||||
|
||||
**GOOD: Fonts with compatible character**
|
||||
```xml
|
||||
<w:rPr><w:rFonts w:ascii="Calibri Light"/><w:sz w:val="36"/></w:rPr> <!-- heading -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri"/><w:sz w:val="22"/></w:rPr> <!-- body -->
|
||||
```
|
||||
**Why better:** Paired fonts should share a similar level of formality and geometric character. Comic Sans is playful/informal; Times New Roman is formal/traditional. They clash.
|
||||
|
||||
---
|
||||
|
||||
### 5c. Everything Bold — Nothing Stands Out
|
||||
|
||||
**BAD: Bold on body, headings, captions, everything**
|
||||
```xml
|
||||
<w:rPr><w:b/><w:sz w:val="40"/></w:rPr> <!-- heading: bold -->
|
||||
<w:rPr><w:b/><w:sz w:val="22"/></w:rPr> <!-- body: also bold -->
|
||||
<w:rPr><w:b/><w:sz w:val="18"/></w:rPr> <!-- caption: still bold -->
|
||||
```
|
||||
|
||||
**GOOD: Bold reserved for headings and key terms only**
|
||||
```xml
|
||||
<w:rPr><w:b/><w:sz w:val="40"/></w:rPr> <!-- H1: bold -->
|
||||
<w:rPr><w:sz w:val="32"/></w:rPr> <!-- H2: size alone is enough -->
|
||||
<w:rPr><w:sz w:val="22"/></w:rPr> <!-- body: regular weight -->
|
||||
<w:rPr><w:b/><w:sz w:val="22"/></w:rPr> <!-- key term inline: bold -->
|
||||
<w:rPr><w:sz w:val="18"/></w:rPr> <!-- caption: regular, small -->
|
||||
```
|
||||
**Why better:** When everything is emphasized, nothing is emphasized. Bold should be a signal, not a default.
|
||||
|
||||
---
|
||||
|
||||
## 6. Color Abuse
|
||||
|
||||
### 6a. Rainbow Headings
|
||||
|
||||
**BAD: Each heading level a different bright color**
|
||||
```xml
|
||||
<w:rPr><w:color w:val="FF0000"/><w:sz w:val="40"/></w:rPr> <!-- H1: red -->
|
||||
<w:rPr><w:color w:val="00AA00"/><w:sz w:val="32"/></w:rPr> <!-- H2: green -->
|
||||
<w:rPr><w:color w:val="0000FF"/><w:sz w:val="26"/></w:rPr> <!-- H3: blue -->
|
||||
```
|
||||
|
||||
**GOOD: Single accent color for headings, black or dark gray for body**
|
||||
```xml
|
||||
<!-- All headings use the same muted accent -->
|
||||
<w:rPr><w:color w:val="1F4E79"/><w:sz w:val="40"/></w:rPr> <!-- H1: dark blue -->
|
||||
<w:rPr><w:color w:val="1F4E79"/><w:sz w:val="32"/></w:rPr> <!-- H2: same blue -->
|
||||
<w:rPr><w:color w:val="1F4E79"/><w:sz w:val="26"/></w:rPr> <!-- H3: same blue -->
|
||||
<!-- Body in near-black -->
|
||||
<w:rPr><w:color w:val="333333"/><w:sz w:val="22"/></w:rPr>
|
||||
```
|
||||
**Why better:** A single accent color establishes brand consistency. Multiple bright colors compete for attention and look unprofessional.
|
||||
|
||||
---
|
||||
|
||||
### 6b. Low Contrast — Light Gray on White
|
||||
|
||||
**BAD: #CCCCCC text on white background**
|
||||
```xml
|
||||
<w:rPr><w:color w:val="CCCCCC"/></w:rPr>
|
||||
<!-- Contrast ratio: ~1.6:1 — fails WCAG AA (minimum 4.5:1) -->
|
||||
```
|
||||
|
||||
**GOOD: #333333 text on white**
|
||||
```xml
|
||||
<w:rPr><w:color w:val="333333"/></w:rPr>
|
||||
<!-- Contrast ratio: ~12:1 — passes WCAG AAA -->
|
||||
```
|
||||
**Why better:** Sufficient contrast is not just an accessibility requirement; it makes text physically easier to read for everyone, especially in printed documents.
|
||||
|
||||
---
|
||||
|
||||
### 6c. Bright Body Text
|
||||
|
||||
**BAD: Body text in a saturated color**
|
||||
```xml
|
||||
<w:rPr><w:color w:val="0066FF"/><w:sz w:val="22"/></w:rPr> <!-- blue body text -->
|
||||
```
|
||||
|
||||
**GOOD: Color reserved for headings and inline accents only**
|
||||
```xml
|
||||
<!-- Body: neutral dark -->
|
||||
<w:rPr><w:color w:val="333333"/><w:sz w:val="22"/></w:rPr>
|
||||
<!-- Hyperlink: color is functional here -->
|
||||
<w:rPr><w:color w:val="0563C1"/><w:u w:val="single"/></w:rPr>
|
||||
```
|
||||
**Why better:** Colored body text causes eye fatigue over long reading. Reserve color for elements that need to attract attention (headings, links, warnings).
|
||||
|
||||
---
|
||||
|
||||
## 7. List Formatting Issues
|
||||
|
||||
### 7a. Bullet at the Margin — No Indent
|
||||
|
||||
**BAD: List items start at the left margin**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│Here is a paragraph of text. │
|
||||
│• First item │ ← Bullet at margin, no indent
|
||||
│• Second item │
|
||||
│• Third item │
|
||||
│Next paragraph continues here. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:ind w:left="0" w:hanging="0"/>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
**GOOD: 0.25in left indent with hanging indent for the bullet**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│Here is a paragraph of text. │
|
||||
│ • First item │ ← Indented, clearly a list
|
||||
│ • Second item │
|
||||
│ • Third item │
|
||||
│Next paragraph continues here. │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:ind w:left="360" w:hanging="360"/> <!-- 0.25in = 360 twips -->
|
||||
<w:numPr>
|
||||
<w:ilvl w:val="0"/>
|
||||
<w:numId w:val="1"/>
|
||||
</w:numPr>
|
||||
</w:pPr>
|
||||
```
|
||||
For nested lists, increment by 360 twips per level:
|
||||
```xml
|
||||
<!-- Level 1 -->
|
||||
<w:ind w:left="720" w:hanging="360"/> <!-- 0.5in left -->
|
||||
<!-- Level 2 -->
|
||||
<w:ind w:left="1080" w:hanging="360"/> <!-- 0.75in left -->
|
||||
```
|
||||
**Why better:** Indentation visually separates lists from body text and makes nesting levels clear.
|
||||
|
||||
---
|
||||
|
||||
### 7b. List Items with Full Paragraph Spacing
|
||||
|
||||
**BAD: List items have the same 8-10pt spacing as body paragraphs**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ • First item │
|
||||
│ │ ← 10pt gap — looks like separate
|
||||
│ • Second item │ paragraphs, not a list
|
||||
│ │
|
||||
│ • Third item │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:spacing w:after="200"/> <!-- 10pt after each list item -->
|
||||
```
|
||||
|
||||
**GOOD: Tight spacing between list items (2-4pt)**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ • First item │
|
||||
│ • Second item │ ← 2pt gap — cohesive list
|
||||
│ • Third item │
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<w:spacing w:after="40" w:line="276" w:lineRule="auto"/> <!-- 2pt after -->
|
||||
<!-- Or 4pt: -->
|
||||
<w:spacing w:after="80"/>
|
||||
```
|
||||
**Why better:** Tight spacing groups list items as a single unit, matching how readers expect a list to behave.
|
||||
|
||||
---
|
||||
|
||||
## 8. Header/Footer Problems
|
||||
|
||||
### 8a. Header Text Too Large — Competes with Body
|
||||
|
||||
**BAD: Header in 12pt, same as body**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ Quarterly Report - Q3 2025 │ ← 12pt header, same as body
|
||||
│──────────────────────────────────│
|
||||
│ Introduction │
|
||||
│ This is the body text... │ ← 12pt body — header distracts
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<!-- Header paragraph -->
|
||||
<w:rPr><w:sz w:val="24"/></w:rPr> <!-- 12pt, same as body -->
|
||||
```
|
||||
|
||||
**GOOD: Header in 9pt, gray color, subtle**
|
||||
```
|
||||
┌──────────────────────────────────┐
|
||||
│ Quarterly Report - Q3 2025 │ ← 9pt, gray — present but quiet
|
||||
│──────────────────────────────────│
|
||||
│ Introduction │
|
||||
│ This is the body text... │ ← Body stands out as primary
|
||||
└──────────────────────────────────┘
|
||||
```
|
||||
```xml
|
||||
<!-- Header paragraph -->
|
||||
<w:rPr>
|
||||
<w:sz w:val="18"/> <!-- 9pt -->
|
||||
<w:color w:val="808080"/> <!-- medium gray -->
|
||||
</w:rPr>
|
||||
<w:pPr>
|
||||
<w:pBdr>
|
||||
<w:bottom w:val="single" w:sz="4" w:color="D9D9D9"/> <!-- subtle separator -->
|
||||
</w:pBdr>
|
||||
</w:pPr>
|
||||
```
|
||||
**Why better:** Headers are reference information, not primary content. They should be legible but visually subordinate.
|
||||
|
||||
---
|
||||
|
||||
### 8b. No Page Numbers on a Long Document
|
||||
|
||||
**BAD: 20-page document with no page numbers**
|
||||
```xml
|
||||
<!-- Footer section: empty or missing -->
|
||||
```
|
||||
|
||||
**GOOD: Page numbers in footer, right-aligned or centered**
|
||||
```xml
|
||||
<!-- Footer paragraph with page number field -->
|
||||
<w:p>
|
||||
<w:pPr>
|
||||
<w:jc w:val="center"/>
|
||||
<w:rPr><w:sz w:val="18"/><w:color w:val="808080"/></w:rPr>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr><w:sz w:val="18"/><w:color w:val="808080"/></w:rPr>
|
||||
<w:fldChar w:fldCharType="begin"/>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:instrText> PAGE </w:instrText>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="separate"/>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:t>1</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="end"/>
|
||||
</w:r>
|
||||
</w:p>
|
||||
```
|
||||
**Why better:** Page numbers are essential for navigation in any document over ~3 pages. Readers need to reference specific pages, and printed documents need an ordering mechanism.
|
||||
|
||||
---
|
||||
|
||||
## 9. CJK-Specific Mistakes
|
||||
|
||||
### 9a. Using Italic for Chinese Emphasis
|
||||
|
||||
**BAD: Italic applied to Chinese text**
|
||||
```xml
|
||||
<w:rPr>
|
||||
<w:i/>
|
||||
<w:rFonts w:eastAsia="SimSun"/>
|
||||
<w:sz w:val="24"/>
|
||||
</w:rPr>
|
||||
```
|
||||
CJK glyphs have no true italic form. The renderer applies a synthetic slant that looks broken and ugly — characters appear to lean awkwardly.
|
||||
|
||||
**GOOD: Use bold or emphasis dots (着重号) for Chinese emphasis**
|
||||
```xml
|
||||
<!-- Option A: Bold emphasis -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:rFonts w:eastAsia="SimHei"/> <!-- Switch to bold-capable font -->
|
||||
<w:sz w:val="24"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- Option B: Emphasis marks (dots under characters) -->
|
||||
<w:rPr>
|
||||
<w:em w:val="dot"/>
|
||||
<w:rFonts w:eastAsia="SimSun"/>
|
||||
<w:sz w:val="24"/>
|
||||
</w:rPr>
|
||||
```
|
||||
**Why better:** Chinese typography has its own emphasis traditions. Bold and emphasis dots are native CJK conventions; italic is a Latin-script concept that does not translate.
|
||||
|
||||
---
|
||||
|
||||
### 9b. Latin Font for Chinese Characters
|
||||
|
||||
**BAD: Only ASCII font set, no EastAsia font specified**
|
||||
```xml
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Arial"/> <!-- No eastAsia attribute -->
|
||||
<w:sz w:val="24"/>
|
||||
</w:rPr>
|
||||
<!-- Word falls back to a default font you did not choose. Chinese characters may render
|
||||
with wrong metrics, inconsistent stroke widths, or missing glyphs. -->
|
||||
```
|
||||
|
||||
**GOOD: Explicit EastAsia font alongside ASCII font**
|
||||
```xml
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:eastAsia="Microsoft YaHei"/>
|
||||
<w:sz w:val="22"/>
|
||||
</w:rPr>
|
||||
```
|
||||
For formal/academic Chinese documents:
|
||||
```xml
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman"
|
||||
w:eastAsia="SimSun"/>
|
||||
<w:sz w:val="24"/> <!-- 小四 12pt -->
|
||||
</w:rPr>
|
||||
```
|
||||
**Why better:** Setting `w:eastAsia` ensures Chinese characters render in a font designed for CJK glyphs, with correct stroke widths, spacing, and metrics.
|
||||
|
||||
---
|
||||
|
||||
### 9c. English Line Spacing for Dense CJK Text
|
||||
|
||||
**BAD: 1.15x line spacing for Chinese body text**
|
||||
```xml
|
||||
<w:spacing w:line="276" w:lineRule="auto"/> <!-- 1.15x — too tight for CJK -->
|
||||
```
|
||||
CJK characters are taller and denser than Latin letters. At 1.15x, lines of Chinese text feel cramped and hard to read.
|
||||
|
||||
**GOOD: 1.5x line spacing or fixed 28pt for CJK body at 12pt (小四)**
|
||||
```xml
|
||||
<!-- Option A: 1.5x proportional -->
|
||||
<w:spacing w:line="360" w:lineRule="auto"/> <!-- 360/240 = 1.5x -->
|
||||
|
||||
<!-- Option B: Fixed 28pt (standard for 小四/12pt CJK body) -->
|
||||
<w:spacing w:line="560" w:lineRule="exact"/> <!-- 28pt = 560 twips -->
|
||||
```
|
||||
For 公文 (government documents) at 三号/16pt body:
|
||||
```xml
|
||||
<w:spacing w:line="580" w:lineRule="exact"/> <!-- 29pt fixed line spacing -->
|
||||
```
|
||||
**Why better:** CJK characters occupy a full em square with no ascenders/descenders providing natural gaps. Extra line spacing compensates, improving readability of dense text blocks.
|
||||
|
||||
---
|
||||
|
||||
## 10. Overall Document Feel
|
||||
|
||||
### Student Homework vs Professional Document
|
||||
|
||||
**BAD: "Student homework" — every setting is Word's default, no intentional choices**
|
||||
```xml
|
||||
<!-- Default everything: Calibri 11pt, no heading styles, 1.08 spacing -->
|
||||
<w:rPr><w:rFonts w:ascii="Calibri"/><w:sz w:val="22"/></w:rPr>
|
||||
<w:pPr><w:spacing w:after="160" w:line="259" w:lineRule="auto"/></w:pPr>
|
||||
<!-- Headings: just bold body text, no style applied -->
|
||||
<w:rPr><w:b/><w:sz w:val="22"/></w:rPr>
|
||||
<!-- No section breaks, no headers/footers, no page numbers -->
|
||||
<!-- Tables with default full grid borders -->
|
||||
<!-- No intentional color or spacing variations -->
|
||||
```
|
||||
|
||||
**GOOD: Intentional design at every level**
|
||||
```xml
|
||||
<!-- Theme fonts defined -->
|
||||
<w:rFonts w:asciiTheme="minorHAnsi" w:hAnsiTheme="minorHAnsi"/>
|
||||
|
||||
<!-- H1: Calibri Light 20pt, dark blue, generous spacing -->
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Heading1"/>
|
||||
<w:spacing w:before="480" w:after="200"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light"/>
|
||||
<w:color w:val="1F4E79"/>
|
||||
<w:sz w:val="40"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- H2: Calibri Light 16pt, same blue -->
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Heading2"/>
|
||||
<w:spacing w:before="320" w:after="120"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri Light"/>
|
||||
<w:color w:val="1F4E79"/>
|
||||
<w:sz w:val="32"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- Body: Calibri 11pt, dark gray, 1.15 spacing, 8pt after -->
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="276" w:lineRule="auto"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri"/>
|
||||
<w:color w:val="333333"/>
|
||||
<w:sz w:val="22"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- Tables: three-line style, padded cells, repeated headers -->
|
||||
<!-- Headers/footers: 9pt gray with page numbers -->
|
||||
<!-- Margins: 1in all around -->
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"/>
|
||||
```
|
||||
**Why better:** Professional documents result from deliberate, consistent choices across all design dimensions. Each element reinforces the same visual language. The reader may not consciously notice good typography, but they feel the difference in credibility and readability.
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference: Safe Defaults
|
||||
|
||||
A cheat sheet of values that produce a professional result for most Western business documents:
|
||||
|
||||
| Element | Value | OpenXML |
|
||||
|---------|-------|---------|
|
||||
| Body font | Calibri 11pt | `w:sz="22"` |
|
||||
| H1 | Calibri Light 20pt | `w:sz="40"` |
|
||||
| H2 | Calibri Light 16pt | `w:sz="32"` |
|
||||
| H3 | Calibri 13pt bold | `w:sz="26"`, `w:b` |
|
||||
| Body color | #333333 | `w:color="333333"` |
|
||||
| Heading color | #1F4E79 | `w:color="1F4E79"` |
|
||||
| Line spacing | 1.15x | `w:line="276" w:lineRule="auto"` |
|
||||
| Para spacing after | 8pt | `w:after="160"` |
|
||||
| H1 spacing | 24pt before, 10pt after | `w:before="480" w:after="200"` |
|
||||
| H2 spacing | 16pt before, 6pt after | `w:before="320" w:after="120"` |
|
||||
| Margins | 1in all around | `w:pgMar` all `"1440"` |
|
||||
| Table cell padding | 0.08in / 0.12in | `w:w="115"` / `w:w="173"` |
|
||||
| Header/footer size | 9pt gray | `w:sz="18" w:color="808080"` |
|
||||
| List indent | 0.25in per level | `w:left="360" w:hanging="360"` |
|
||||
| List item spacing | 2pt after | `w:after="40"` |
|
||||
|
||||
For CJK documents, adjust: body font to SimSun or Microsoft YaHei, line spacing to 1.5x (`w:line="360"`), and set `w:eastAsia` on all `w:rFonts`.
|
||||
@@ -0,0 +1,819 @@
|
||||
# Design Principles for Document Typography
|
||||
|
||||
WHY certain typographic choices look good -- the perceptual and psychological
|
||||
reasons behind professional document design. Use this to make judgment calls
|
||||
when exact specs are not provided.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
1. [White Space & Breathing Room](#1-white-space--breathing-room)
|
||||
2. [Contrast & Scale](#2-contrast--scale)
|
||||
3. [Proximity & Grouping](#3-proximity--grouping)
|
||||
4. [Alignment & Grid](#4-alignment--grid)
|
||||
5. [Repetition & Consistency](#5-repetition--consistency)
|
||||
6. [Visual Hierarchy & Flow](#6-visual-hierarchy--flow)
|
||||
|
||||
---
|
||||
|
||||
## 1. White Space & Breathing Room
|
||||
|
||||
### Why It Works
|
||||
|
||||
The human eye does not read continuously. It jumps in saccades, fixating on
|
||||
small clusters of words. White space provides landing zones for these fixations
|
||||
and gives the reader's peripheral vision a "frame" that makes each text block
|
||||
feel manageable. When a page is packed to the edges, every glance returns more
|
||||
text than working memory can buffer, triggering fatigue and avoidance.
|
||||
|
||||
Research on content density consistently shows:
|
||||
|
||||
- **60-70% content coverage** feels comfortable and professional.
|
||||
- **80%+** starts to feel dense and bureaucratic.
|
||||
- **90%+** feels oppressive -- the reader unconsciously rushes or skips.
|
||||
- **Below 50%** feels wasteful or pretentious (unless intentional, like poetry).
|
||||
|
||||
Wider margins also carry cultural signals. Academic and luxury documents use
|
||||
generous margins (1.25-1.5 inches). Internal memos and drafts use narrower
|
||||
margins (0.75-1.0 inches). The margin width tells the reader how much care
|
||||
went into the document before they read a single word.
|
||||
|
||||
Line spacing has a direct physiological basis: the eye must track back to the
|
||||
start of the next line after each line break. If lines are too close, the eye
|
||||
"slips" to the wrong line. If too far apart, the eye loses its sense of
|
||||
continuity. The sweet spot is 120-145% of the font size.
|
||||
|
||||
**Rule of thumb: when in doubt, add more space, not less.**
|
||||
|
||||
### Good Example
|
||||
|
||||
```
|
||||
Margins: 1 inch (1440 twips) all sides for business documents.
|
||||
Line spacing: 1.15 (w:line="276"; with lineRule="auto" the unit is 240ths of a line, so 276/240 = 115%).
|
||||
Paragraph spacing after: 8pt (160 twips) between body paragraphs.
|
||||
```
|
||||
|
||||
```xml
|
||||
<!-- Page margins: 1 inch = 1440 twips on all sides -->
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"
|
||||
w:header="720" w:footer="720" w:gutter="0"/>
|
||||
|
||||
<!-- Body paragraph: 1.15 line spacing, 8pt after -->
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="276" w:lineRule="auto"/>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
This produces a page where content occupies roughly 65% of the area. The
|
||||
reader sees clear top/bottom breathing room, and paragraphs are distinct
|
||||
without feeling disconnected.
|
||||
|
||||
```
|
||||
Page layout (good):
|
||||
+----------------------------------+
|
||||
| 1" margin |
|
||||
| +------------------------+ |
|
||||
| | Heading | |
|
||||
| | | |
|
||||
| | Body text here with | |
|
||||
| | comfortable spacing | |
|
||||
| | between lines. | |
|
||||
| | | | <- visible gap between paragraphs
|
||||
| | Another paragraph of | |
|
||||
| | body text follows. | |
|
||||
| | | |
|
||||
| +------------------------+ |
|
||||
| 1" margin |
|
||||
+----------------------------------+
|
||||
```
|
||||
|
||||
### Bad Example
|
||||
|
||||
```xml
|
||||
<!-- Cramped margins: 0.5 inch = 720 twips -->
|
||||
<w:pgMar w:top="720" w:right="720" w:bottom="720" w:left="720"
|
||||
w:header="360" w:footer="360" w:gutter="0"/>
|
||||
|
||||
<!-- No paragraph spacing, single line spacing -->
|
||||
<w:pPr>
|
||||
<w:spacing w:after="0" w:line="240" w:lineRule="auto"/>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
This fills ~85% of the page. Text runs edge-to-edge with no visual rest stops.
|
||||
The reader sees a wall of text.
|
||||
|
||||
```
|
||||
Page layout (bad):
|
||||
+----------------------------------+
|
||||
| Heading |
|
||||
| Body text crammed right up to |
|
||||
| the margins with no spacing |
|
||||
| between lines or paragraphs. |
|
||||
| Another paragraph starts here |
|
||||
| and the reader cannot tell where |
|
||||
| one idea ends and another begins |
|
||||
| because everything blurs into a |
|
||||
| single dense block of text. |
|
||||
+----------------------------------+
|
||||
```
|
||||
|
||||
### Quick Test
|
||||
|
||||
1. Zoom out to 50% in your document viewer. If you cannot see clear "channels"
|
||||
of white between text blocks, the spacing is too tight.
|
||||
2. Print a test page. Hold it at arm's length. The text area should look like
|
||||
a rectangle floating in white, not filling the page.
|
||||
3. Check: is the line spacing value at least 264 (`w:line` for 1.1x) for body
|
||||
text? If it is 240 (single), it is too tight for anything over 10pt.
|
||||
|
||||
---
|
||||
|
||||
## 2. Contrast & Scale
|
||||
|
||||
### Why It Works
|
||||
|
||||
The brain processes visual hierarchy through relative difference, not absolute
|
||||
size. A 20pt heading above 11pt body text creates a clear "this is important"
|
||||
signal. But if every heading is 20pt and every sub-heading is 19pt, the brain
|
||||
cannot distinguish them -- they merge into the same level.
|
||||
|
||||
The key insight is **modular scale**: font sizes that grow by a consistent
|
||||
ratio. This mirrors natural proportions and feels harmonious for the same
|
||||
reason musical intervals do.
|
||||
|
||||
Common scales and their character:
|
||||
|
||||
| Ratio | Name | Character | Example progression (from 11pt) |
|
||||
|-------|----------------|---------------------------------|---------------------------------|
|
||||
| 1.200 | Minor third | Subtle, refined | 11 → 13.2 → 15.8 → 19.0 |
|
||||
| 1.250 | Major third | Balanced, professional | 11 → 13.75 → 17.2 → 21.5 |
|
||||
| 1.333 | Perfect fourth | Strong, authoritative | 11 → 14.7 → 19.5 → 26.0 |
|
||||
| 1.414 | Augmented 4th | Dramatic, presentation-style | 11 → 15.6 → 22.0 → 31.1 |
|
||||
|
||||
For most business documents, 1.25 (major third) works best:
|
||||
|
||||
```
|
||||
Body = 11pt (w:sz="22")
|
||||
H3 = 13pt (w:sz="26") -- 11 * 1.25 = 13.75, round down to 13
|
||||
H2 = 16pt (w:sz="32") -- 13 * 1.25 = 16.25, round to 16
|
||||
H1 = 20pt (w:sz="40") -- 16 * 1.25 = 20
|
||||
```
|
||||
|
||||
Beyond size, **weight contrast** creates hierarchy without consuming vertical
|
||||
space. Regular (400) vs Bold (700) is visible at any size. Semi-bold (600) vs
|
||||
Regular is subtle and best avoided unless you also vary size or color.
|
||||
|
||||
**Color contrast** adds a third dimension. Dark blue headings (#1F3864) against
|
||||
softer dark gray body text (#333333) signals "heading" without needing a huge
|
||||
size jump. Pure black (#000000) body text is harsher than necessary on white
|
||||
backgrounds -- #333333 or #2D2D2D reduces glare without losing legibility.
|
||||
|
||||
### Good Example
|
||||
|
||||
```xml
|
||||
<!-- H1: 20pt, bold, dark navy -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="40"/>
|
||||
<w:color w:val="1F3864"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- H2: 16pt, bold, dark navy -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="32"/>
|
||||
<w:color w:val="1F3864"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- H3: 13pt, bold, dark navy -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="26"/>
|
||||
<w:color w:val="1F3864"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- Body: 11pt, regular, dark gray -->
|
||||
<w:rPr>
|
||||
<w:sz w:val="22"/>
|
||||
<w:color w:val="333333"/>
|
||||
</w:rPr>
|
||||
```
|
||||
|
||||
```
|
||||
Visual hierarchy (good):
|
||||
|
||||
[████████████████████] <- H1: 20pt bold navy (clearly dominant)
|
||||
(generous space)
|
||||
[██████████████] <- H2: 16pt bold navy (distinct step down)
|
||||
(moderate space)
|
||||
[████████████] <- H3: 13pt bold navy (smaller but still bold)
|
||||
[░░░░░░░░░░░░░░░░░░░░░░] <- Body: 11pt regular gray
|
||||
[░░░░░░░░░░░░░░░░░░░░░░]
|
||||
[░░░░░░░░░░░░░░░░░░░░░░]
|
||||
```
|
||||
|
||||
Each level is visually distinct from its neighbors. You can identify the
|
||||
hierarchy even in peripheral vision.
|
||||
|
||||
### Bad Example
|
||||
|
||||
```xml
|
||||
<!-- H1: 14pt bold black -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="28"/>
|
||||
<w:color w:val="000000"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- H2: 13pt bold black -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="26"/>
|
||||
<w:color w:val="000000"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- H3: 12pt bold black -->
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="24"/>
|
||||
<w:color w:val="000000"/>
|
||||
</w:rPr>
|
||||
|
||||
<!-- Body: 12pt regular black -->
|
||||
<w:rPr>
|
||||
<w:sz w:val="24"/>
|
||||
<w:color w:val="000000"/>
|
||||
</w:rPr>
|
||||
```
|
||||
|
||||
Problems:
|
||||
- H3 (12pt bold) and body (12pt regular) differ only by weight -- too subtle.
|
||||
- H1 (14pt) to H2 (13pt) is a 1pt step -- invisible at reading distance.
|
||||
- Everything is pure black so color provides no differentiating signal.
|
||||
- The ratio between levels is ~1.08 (14/13 and 13/12), far too flat.
|
||||
|
||||
### Quick Test
|
||||
|
||||
1. **The squint test**: blur your eyes or step back from the screen. Can you
|
||||
count the number of heading levels? If two levels merge, their contrast
|
||||
is insufficient.
|
||||
2. **Ratio check**: divide each heading size by the next smaller size. If any
|
||||
ratio is below 1.15, the levels will look too similar.
|
||||
3. **Color check**: do headings look distinct from body text when you glance
|
||||
at the page? If everything is the same color, you are relying solely on
|
||||
size/weight, which limits your hierarchy to ~3 effective levels.
|
||||
|
||||
---
|
||||
|
||||
## 3. Proximity & Grouping
|
||||
|
||||
### Why It Works
|
||||
|
||||
The Gestalt principle of proximity: items that are close together are perceived
|
||||
as belonging to the same group. In document typography, this means a heading
|
||||
must be **closer to the content it introduces** than to the content above it.
|
||||
|
||||
If a heading sits equidistant between two paragraphs, it looks orphaned -- the
|
||||
reader's eye does not know if it belongs to the text above or below. The fix
|
||||
is asymmetric spacing: **large space before the heading, small space after**.
|
||||
|
||||
The recommended ratio is 2:1 or 3:1 (space-before : space-after).
|
||||
|
||||
This same principle applies to:
|
||||
- **List items**: spacing between items should be less than spacing between
|
||||
paragraphs. Items in a list are a group and should visually cluster.
|
||||
- **Captions**: a figure caption should be close to its figure, not floating
|
||||
in the middle between the figure and the next paragraph.
|
||||
- **Table titles**: the title sits close above the table, with more space
|
||||
separating the title from preceding text.
|
||||
|
||||
### Good Example
|
||||
|
||||
```xml
|
||||
<!-- H2: 18pt before, 6pt after (3:1 ratio) -->
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Heading2"/>
|
||||
<w:spacing w:before="360" w:after="120"/>
|
||||
</w:pPr>
|
||||
|
||||
<!-- Body paragraph: 0pt before, 8pt after -->
|
||||
<w:pPr>
|
||||
<w:spacing w:before="0" w:after="160"/>
|
||||
</w:pPr>
|
||||
|
||||
<!-- List item: 0pt before, 2pt after (tight grouping) -->
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="ListParagraph"/>
|
||||
<w:spacing w:before="0" w:after="40"/>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
```
|
||||
Proximity (good):
|
||||
|
||||
...end of previous section text.
|
||||
<- 18pt gap (w:before="360")
|
||||
## Section Heading
|
||||
<- 6pt gap (w:after="120")
|
||||
First paragraph of new section
|
||||
continues here with content.
|
||||
<- 8pt gap (w:after="160")
|
||||
Second paragraph follows.
|
||||
|
||||
The heading clearly "belongs to" the text below it.
|
||||
```
|
||||
|
||||
```
|
||||
List grouping (good):
|
||||
|
||||
Consider these factors:
|
||||
- First item <- 2pt gap between items
|
||||
- Second item <- items cluster as a group
|
||||
- Third item
|
||||
<- 8pt gap after list
|
||||
The next paragraph starts here.
|
||||
```
|
||||
|
||||
### Bad Example
|
||||
|
||||
```xml
|
||||
<!-- H2: 12pt before, 12pt after (1:1 ratio -- orphaned heading) -->
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Heading2"/>
|
||||
<w:spacing w:before="240" w:after="240"/>
|
||||
</w:pPr>
|
||||
|
||||
<!-- List item: same spacing as body (10pt after) -->
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="ListParagraph"/>
|
||||
<w:spacing w:before="0" w:after="200"/>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
```
|
||||
Proximity (bad):
|
||||
|
||||
...end of previous section text.
|
||||
<- 12pt gap
|
||||
## Section Heading
|
||||
<- 12pt gap (same!)
|
||||
First paragraph of new section.
|
||||
|
||||
The heading floats between sections. It is unclear what it belongs to.
|
||||
```
|
||||
|
||||
```
|
||||
List grouping (bad):
|
||||
|
||||
Consider these factors:
|
||||
<- 10pt gap
|
||||
- First item
|
||||
<- 10pt gap (same as paragraphs)
|
||||
- Second item
|
||||
<- 10pt gap
|
||||
- Third item
|
||||
<- 10pt gap
|
||||
Next paragraph.
|
||||
|
||||
The list does not feel like a group. Each item looks like a
|
||||
separate paragraph that happens to have a bullet.
|
||||
```
|
||||
|
||||
### Quick Test
|
||||
|
||||
1. **Cover test**: cover the heading text. Looking only at the whitespace,
|
||||
can you tell which block of text the heading belongs to? If the gaps above
|
||||
and below are equal, the answer is "no."
|
||||
2. **Number check**: `w:before` on headings should be at least 2x `w:after`.
|
||||
Common good values: before=360 / after=120, or before=240 / after=80.
|
||||
3. **List check**: `w:after` on list items should be less than half of
|
||||
`w:after` on body paragraphs. If body uses 160, list items should use
|
||||
40-60.
|
||||
|
||||
---
|
||||
|
||||
## 4. Alignment & Grid
|
||||
|
||||
### Why It Works
|
||||
|
||||
Alignment creates invisible lines that the eye follows down the page. When
|
||||
elements share the same left edge, the reader perceives order and intention.
|
||||
When elements are slightly misaligned (off by a few twips), the page looks
|
||||
sloppy even if the reader cannot consciously identify why.
|
||||
|
||||
**Left-align vs Justify:**
|
||||
|
||||
- **Left-aligned** (ragged right) is best for English and other Latin-script
|
||||
languages. The uneven right edge actually helps reading because each line
|
||||
has a unique silhouette, making it easier for the eye to find the next line.
|
||||
Justified text forces uneven word spacing that creates distracting "rivers"
|
||||
of white running vertically through paragraphs.
|
||||
|
||||
- **Justified** is best for CJK text. Chinese, Japanese, and Korean characters
|
||||
are monospaced by design -- each occupies the same cell in an invisible grid.
|
||||
Justification preserves this grid perfectly. Ragged right in CJK text breaks
|
||||
the grid and looks untidy.
|
||||
|
||||
**Indentation rule:** Use first-line indent OR paragraph spacing to separate
|
||||
paragraphs -- never both. They serve the same purpose (marking paragraph
|
||||
boundaries). Using both wastes space and creates visual stutter.
|
||||
|
||||
- Western convention: paragraph spacing (no indent) is more modern.
|
||||
- CJK convention: first-line indent of 2 characters is standard.
|
||||
- Academic convention: first-line indent of 0.5 inch is traditional.
|
||||
|
||||
### Good Example
|
||||
|
||||
```xml
|
||||
<!-- English body: left-aligned, paragraph spacing, no indent -->
|
||||
<w:pPr>
|
||||
<w:jc w:val="left"/>
|
||||
<w:spacing w:after="160" w:line="276" w:lineRule="auto"/>
|
||||
<!-- No w:ind firstLine -->
|
||||
</w:pPr>
|
||||
|
||||
<!-- CJK body: justified, first-line indent 2 chars, no paragraph spacing -->
|
||||
<w:pPr>
|
||||
<w:jc w:val="both"/>
|
||||
<w:spacing w:after="0" w:line="360" w:lineRule="auto"/>
|
||||
<w:ind w:firstLineChars="200"/>
|
||||
</w:pPr>
|
||||
|
||||
<!-- Tab stops creating aligned columns -->
|
||||
<w:pPr>
|
||||
<w:tabs>
|
||||
<w:tab w:val="left" w:pos="2880"/> <!-- 2 inches -->
|
||||
<w:tab w:val="right" w:pos="9360"/> <!-- 6.5 inches (right margin) -->
|
||||
</w:tabs>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
```
|
||||
English paragraph separation (good -- spacing, no indent):
|
||||
|
||||
This is the first paragraph with some text
|
||||
that wraps to a second line naturally.
|
||||
|
||||
This is the second paragraph. The gap above
|
||||
clearly marks the boundary.
|
||||
|
||||
|
||||
CJK paragraph separation (good -- indent, no spacing):
|
||||
|
||||
第一段正文内容从这里开始,使用两个字符
|
||||
的首行缩进来标记段落边界。
|
||||
第二段紧跟其后,没有段间距,但首行缩进
|
||||
清晰地标识了新段落的开始。
|
||||
```
|
||||
|
||||
### Bad Example
|
||||
|
||||
```xml
|
||||
<!-- English body: justified (creates word-spacing rivers) -->
|
||||
<w:pPr>
|
||||
<w:jc w:val="both"/>
|
||||
<w:spacing w:after="160" w:line="276" w:lineRule="auto"/>
|
||||
<w:ind w:firstLine="720"/> <!-- BOTH indent AND spacing: redundant -->
|
||||
</w:pPr>
|
||||
|
||||
<!-- CJK body: left-aligned (breaks character grid) -->
|
||||
<w:pPr>
|
||||
<w:jc w:val="left"/>
|
||||
<w:spacing w:after="200" w:line="276" w:lineRule="auto"/>
|
||||
<!-- No indent, using spacing instead -- unidiomatic for CJK -->
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
Problems:
|
||||
- Justified English text with narrow columns creates uneven word gaps.
|
||||
- Using both first-line indent AND paragraph spacing is redundant.
|
||||
- Left-aligned CJK breaks the character grid that CJK readers expect.
|
||||
- CJK with spacing-based separation looks like translated western layout.
|
||||
|
||||
### Quick Test
|
||||
|
||||
1. **River test**: in justified English text, squint and look for vertical
|
||||
white streaks running through the paragraph. If you see them, switch to
|
||||
left-align or increase the column width.
|
||||
2. **Double signal check**: does the document use BOTH first-line indent AND
|
||||
paragraph spacing? If yes, remove one. Choose indent for CJK/academic,
|
||||
spacing for modern western.
|
||||
3. **Tab alignment**: if you use tabs for columns, do all tab stops across
|
||||
the document use the same positions? Inconsistent tab stops create jagged
|
||||
invisible grid lines.
|
||||
|
||||
---
|
||||
|
||||
## 5. Repetition & Consistency
|
||||
|
||||
### Why It Works
|
||||
|
||||
Consistency is a trust signal. When a reader sees that every H2 looks the same,
|
||||
every table follows the same pattern, and every page number sits in the same
|
||||
spot, they unconsciously trust that the document was crafted with care. A single
|
||||
inconsistency -- one H2 that is 15pt instead of 16pt, one table with different
|
||||
borders -- breaks that trust and makes the reader question the content.
|
||||
|
||||
Consistency also reduces cognitive load. Once the reader learns "bold dark blue
|
||||
= section heading," they stop spending mental effort on identifying structure
|
||||
and focus entirely on content. Every inconsistency forces them to re-evaluate:
|
||||
"Is this a different kind of heading, or did someone just forget to apply the
|
||||
style?"
|
||||
|
||||
The implementation rule is simple: **use named styles, not direct formatting.**
|
||||
If you define Heading2 as a style and apply it everywhere, consistency is
|
||||
automatic. If you manually set font size, bold, and color on each heading
|
||||
individually, inconsistency is inevitable.
|
||||
|
||||
### Good Example
|
||||
|
||||
```xml
|
||||
<!-- Define styles once in styles.xml -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading2">
|
||||
<w:name w:val="heading 2"/>
|
||||
<w:basedOn w:val="Normal"/>
|
||||
<w:next w:val="Normal"/>
|
||||
<w:pPr>
|
||||
<w:keepNext/>
|
||||
<w:keepLines/>
|
||||
<w:spacing w:before="360" w:after="120"/>
|
||||
<w:outlineLvl w:val="1"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:rFonts w:asciiTheme="majorHAnsi" w:hAnsiTheme="majorHAnsi"/>
|
||||
<w:b/>
|
||||
<w:sz w:val="32"/>
|
||||
<w:color w:val="1F3864"/>
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Apply consistently: every H2 references the style -->
|
||||
<w:p>
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Heading2"/>
|
||||
<!-- No direct formatting overrides -->
|
||||
</w:pPr>
|
||||
<w:r><w:t>Market Analysis</w:t></w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
When using a table style, define it once and reference it for every table:
|
||||
|
||||
```xml
|
||||
<!-- All tables reference the same style -->
|
||||
<w:tblPr>
|
||||
<w:tblStyle w:val="GridTable4Accent1"/>
|
||||
<w:tblW w:w="0" w:type="auto"/>
|
||||
</w:tblPr>
|
||||
```
|
||||
|
||||
### Bad Example
|
||||
|
||||
```xml
|
||||
<!-- First H2: manually formatted -->
|
||||
<w:p>
|
||||
<w:pPr>
|
||||
<w:spacing w:before="360" w:after="120"/>
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="32"/>
|
||||
<w:color w:val="1F3864"/>
|
||||
</w:rPr>
|
||||
<w:t>Market Analysis</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
|
||||
<!-- Second H2: slightly different (15pt instead of 16pt, plus other drift) -->
|
||||
<w:p>
|
||||
<w:pPr>
|
||||
<w:spacing w:before="240" w:after="160"/> <!-- different spacing! -->
|
||||
</w:pPr>
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="30"/> <!-- 15pt instead of 16pt! -->
|
||||
<w:color w:val="2E74B5"/> <!-- different shade of blue! -->
|
||||
</w:rPr>
|
||||
<w:t>Financial Overview</w:t>
|
||||
</w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
Problems:
|
||||
- No style references -- everything is direct formatting.
|
||||
- Second H2 has different size (30 vs 32), color, and spacing.
|
||||
- If there are 20 headings, each could drift slightly differently.
|
||||
- Changing the design later means editing every heading individually.
|
||||
|
||||
### Quick Test
|
||||
|
||||
1. **Style audit**: does every paragraph reference a `w:pStyle`? If you find
|
||||
paragraphs with only direct formatting and no style, that is a consistency
|
||||
risk.
|
||||
2. **Search for variance**: search the XML for all `w:sz` values used with
|
||||
`w:b` (bold). If you find three different sizes for what should be the same
|
||||
heading level, there is an inconsistency.
|
||||
3. **Table check**: do all tables in the document reference the same
|
||||
`w:tblStyle`? If some tables have manual border definitions while others
|
||||
use a style, the document will look patchy.
|
||||
4. **Page numbers**: check that header/footer content is defined in the
|
||||
default section properties and inherited by all sections, not redefined
|
||||
inconsistently in each section.
|
||||
|
||||
---
|
||||
|
||||
## 6. Visual Hierarchy & Flow
|
||||
|
||||
### Why It Works
|
||||
|
||||
A well-designed document guides the reader's eye in a predictable path:
|
||||
title at the top, subtitle below it, section headings as signposts, body text
|
||||
as the main content, footnotes and captions as supporting details. This flow
|
||||
mirrors reading priority -- the most important information is the most visually
|
||||
prominent.
|
||||
|
||||
Each level in the hierarchy must be **distinguishable from its adjacent
|
||||
levels**. It is not enough for H1 to differ from body text; H1 must also
|
||||
clearly differ from H2, and H2 from H3. If any two adjacent levels are too
|
||||
similar, the hierarchy collapses at that point.
|
||||
|
||||
Effective hierarchy uses **multiple simultaneous signals**:
|
||||
|
||||
| Level | Size | Weight | Color | Spacing above |
|
||||
|----------|-------|---------|---------|---------------|
|
||||
| Title | 26pt | Bold | #1F3864 | 0 (top) |
|
||||
| Subtitle | 15pt | Regular | #4472C4 | 4pt |
|
||||
| H1 | 20pt | Bold | #1F3864 | 24pt |
|
||||
| H2 | 16pt | Bold | #1F3864 | 18pt |
|
||||
| H3 | 13pt | Bold | #1F3864 | 12pt |
|
||||
| Body | 11pt | Regular | #333333 | 0pt |
|
||||
| Caption | 9pt | Italic | #666666 | 4pt |
|
||||
| Footnote | 9pt | Regular | #666666 | 0pt |
|
||||
|
||||
Notice how each level differs from its neighbors on at least two dimensions
|
||||
(size + weight, size + color, size + spacing, or weight + style). Single-dimension
|
||||
differences are fragile and can be missed.
|
||||
|
||||
**Section breaks** create rhythm in long documents. A page break before each
|
||||
major section (H1) gives the reader a mental reset. Within sections, consistent
|
||||
heading + body patterns create a predictable cadence that makes long documents
|
||||
less intimidating.
|
||||
|
||||
### Good Example
|
||||
|
||||
```xml
|
||||
<!-- Title: large, bold, navy, centered -->
|
||||
<w:style w:type="paragraph" w:styleId="Title">
|
||||
<w:pPr>
|
||||
<w:spacing w:after="80"/>
|
||||
<w:jc w:val="center"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:color w:val="1F3864"/>
|
||||
<w:sz w:val="52"/>
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Subtitle: medium, regular weight, lighter blue, centered -->
|
||||
<w:style w:type="paragraph" w:styleId="Subtitle">
|
||||
<w:pPr>
|
||||
<w:spacing w:after="320"/>
|
||||
<w:jc w:val="center"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:color w:val="4472C4"/>
|
||||
<w:sz w:val="30"/>
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- H1: page break before, large bold navy -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading1">
|
||||
<w:pPr>
|
||||
<w:keepNext/>
|
||||
<w:keepLines/>
|
||||
<w:pageBreakBefore/>
|
||||
<w:spacing w:before="480" w:after="160"/>
|
||||
<w:outlineLvl w:val="0"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:color w:val="1F3864"/>
|
||||
<w:sz w:val="40"/>
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Caption: small, italic, gray -->
|
||||
<w:style w:type="paragraph" w:styleId="Caption">
|
||||
<w:pPr>
|
||||
<w:spacing w:before="80" w:after="200"/>
|
||||
</w:pPr>
|
||||
<w:rPr>
|
||||
<w:i/>
|
||||
<w:color w:val="666666"/>
|
||||
<w:sz w:val="18"/>
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
```
|
||||
|
||||
```
|
||||
Visual flow (good):
|
||||
|
||||
+----------------------------------+
|
||||
| |
|
||||
| ANNUAL REPORT 2025 | <- Title: 26pt bold navy centered
|
||||
| Acme Corporation | <- Subtitle: 15pt regular blue
|
||||
| |
|
||||
| |
|
||||
+----------------------------------+
|
||||
|
||||
+----------------------------------+
|
||||
| |
|
||||
| 1. Executive Summary | <- H1: 20pt bold navy (page break)
|
||||
| |
|
||||
| Body text introducing the | <- Body: 11pt regular gray
|
||||
| main findings of the year. |
|
||||
| |
|
||||
| 1.1 Revenue Highlights | <- H2: 16pt bold navy
|
||||
| |
|
||||
| Revenue grew by 23% year | <- Body
|
||||
| over year, driven by... |
|
||||
| |
|
||||
| Figure 1: Revenue Growth | <- Caption: 9pt italic gray
|
||||
| |
|
||||
+----------------------------------+
|
||||
|
||||
Each level is immediately identifiable. The eye flows naturally
|
||||
from title -> heading -> body -> caption.
|
||||
```
|
||||
|
||||
### Bad Example
|
||||
|
||||
```xml
|
||||
<!-- All headings same color as body, minimal size difference -->
|
||||
<w:style w:type="paragraph" w:styleId="Heading1">
|
||||
<w:rPr>
|
||||
<w:b/>
|
||||
<w:sz w:val="28"/> <!-- 14pt -- only 3pt above body -->
|
||||
<w:color w:val="000000"/> <!-- same color as body -->
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- Caption same size as body, not italic -->
|
||||
<w:style w:type="paragraph" w:styleId="Caption">
|
||||
<w:rPr>
|
||||
<w:sz w:val="22"/> <!-- same 11pt as body! -->
|
||||
<w:color w:val="000000"/> <!-- same color as body -->
|
||||
</w:rPr>
|
||||
</w:style>
|
||||
|
||||
<!-- No page breaks between major sections -->
|
||||
<!-- H1 has no pageBreakBefore, keepNext, or keepLines -->
|
||||
```
|
||||
|
||||
Problems:
|
||||
- H1 at 14pt is only 1.27x the 11pt body. That ratio is acceptable in
|
||||
isolation, but with the same black color as body, the hierarchy is weak.
|
||||
- Caption is indistinguishable from body text.
|
||||
- No page breaks means major sections bleed into each other with no
|
||||
visual rhythm.
|
||||
- Everything is black, so color provides zero hierarchy signal.
|
||||
|
||||
### Quick Test
|
||||
|
||||
1. **The squint test**: blur your eyes while looking at a full page. You
|
||||
should see 3-4 distinct "weight levels" of gray. If the page looks like
|
||||
one uniform shade, the hierarchy is too flat.
|
||||
2. **The scan test**: flip through pages quickly. Can you identify section
|
||||
boundaries in under one second per page? If yes, the visual hierarchy is
|
||||
working. If pages blur together, you need stronger differentiation at H1.
|
||||
3. **Adjacent level test**: for each heading level, check that it differs
|
||||
from the next level on at least 2 of: size, weight, color, style (italic), spacing.
|
||||
Single-dimension differences get lost.
|
||||
4. **Rhythm test**: in a document over 10 pages, do major sections (H1) start
|
||||
on new pages? If not, long documents will feel like an undifferentiated
|
||||
stream. Add `w:pageBreakBefore` to Heading1.
|
||||
|
||||
---
|
||||
|
||||
## Summary: Decision Checklist
|
||||
|
||||
When you are unsure about a typographic choice, run through these checks:
|
||||
|
||||
| Principle | Question | If No... |
|
||||
|-----------|----------|----------|
|
||||
| White Space | Does the page have at least 30% white space? | Increase margins or spacing |
|
||||
| Contrast | Can I count heading levels by squinting? | Increase size ratios (target 1.25x) |
|
||||
| Proximity | Does each heading clearly belong to text below it? | Make space-before > space-after (2:1) |
|
||||
| Alignment | Is English left-aligned and CJK justified? | Switch alignment mode |
|
||||
| Repetition | Do all same-level elements use the same style? | Replace direct formatting with styles |
|
||||
| Hierarchy | Can I see the document structure at arm's length? | Add more differentiation signals |
|
||||
|
||||
**When two principles conflict, prioritize in this order:**
|
||||
|
||||
1. **Readability** (white space, line spacing) -- always wins
|
||||
2. **Hierarchy** (contrast, scale) -- readers must find what they need
|
||||
3. **Consistency** (repetition) -- builds trust
|
||||
4. **Aesthetics** (alignment, grouping) -- the finishing touch
|
||||
@@ -0,0 +1,308 @@
|
||||
# OpenXML Child Element Ordering Rules
|
||||
|
||||
Element ordering in OpenXML is defined by the XSD schema. Incorrect ordering produces invalid documents that Word may refuse to open or silently repair (potentially losing data).
|
||||
|
||||
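> **Key rule**: An element's own properties element (`w:pPr` in `w:p`, `w:rPr` in `w:r`, `w:tblPr` in `w:tbl`, `w:trPr` in `w:tr`, `w:tcPr` in `w:tc`) must always be its **first child**. Exceptions: `w:sectPr` is the **last** child of `w:body`, and `w:rPr` / `w:sectPr` appear near the end of `w:pPr`.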
|
||||
|
||||
---
|
||||
|
||||
## w:document
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:background [0..1] — page background color/fill
|
||||
2. w:body [0..1] — document content container
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:body
|
||||
|
||||
```
|
||||
Children in order (repeating group):
|
||||
1. w:p [0..*] — paragraph
|
||||
2. w:tbl [0..*] — table
|
||||
3. w:sdt [0..*] — structured document tag (content control)
|
||||
4. w:sectPr [0..1] — LAST child: final section properties
|
||||
```
|
||||
|
||||
Note: `w:p`, `w:tbl`, and `w:sdt` are interleaved in document order. The only strict rule is that `w:sectPr` must be the **last child** of `w:body`.
|
||||
|
||||
---
|
||||
|
||||
## w:p (Paragraph)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:pPr [0..1] — paragraph properties (MUST be first)
|
||||
|
||||
Then any mix of (interleaved in document order):
|
||||
- w:r [0..*] — run
|
||||
- w:hyperlink [0..*] — hyperlink wrapper
|
||||
- w:ins [0..*] — tracked insertion
|
||||
- w:del [0..*] — tracked deletion
|
||||
- w:bookmarkStart [0..*] — bookmark anchor start
|
||||
- w:bookmarkEnd [0..*] — bookmark anchor end
|
||||
- w:commentRangeStart [0..*] — comment range start
|
||||
- w:commentRangeEnd [0..*] — comment range end
|
||||
- w:proofErr [0..*] — proofing error marker
|
||||
- w:fldSimple [0..*] — simple field
|
||||
- w:sdt [0..*] — inline content control
|
||||
- w:smartTag [0..*] — smart tag
|
||||
```
|
||||
|
||||
**Practical note**: After `w:pPr`, the remaining children appear in document reading order. Runs, hyperlinks, bookmarks, and comment ranges intermix freely based on their position in the text.
|
||||
|
||||
---
|
||||
|
||||
## w:pPr (Paragraph Properties)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:pStyle [0..1] — paragraph style reference
|
||||
2. w:keepNext [0..1] — keep with next paragraph
|
||||
3. w:keepLines [0..1] — keep lines together
|
||||
4. w:pageBreakBefore [0..1] — page break before paragraph
|
||||
5. w:framePr [0..1] — text frame properties
|
||||
6. w:widowControl [0..1] — widow/orphan control
|
||||
7. w:numPr [0..1] — numbering properties
|
||||
8. w:suppressLineNumbers [0..1]
|
||||
9. w:pBdr [0..1] — paragraph borders
|
||||
10. w:shd [0..1] — shading
|
||||
11. w:tabs [0..1] — tab stops
|
||||
12. w:suppressAutoHyphens [0..1]
|
||||
13. w:kinsoku [0..1] — CJK kinsoku settings
|
||||
14. w:wordWrap [0..1]
|
||||
15. w:overflowPunct [0..1]
|
||||
16. w:topLinePunct [0..1]
|
||||
17. w:autoSpaceDE [0..1]
|
||||
18. w:autoSpaceDN [0..1]
|
||||
19. w:bidi [0..1] — right-to-left paragraph
|
||||
20. w:adjustRightInd [0..1]
|
||||
21. w:snapToGrid [0..1]
|
||||
22. w:spacing [0..1] — line and paragraph spacing
|
||||
23. w:ind [0..1] — indentation
|
||||
24. w:contextualSpacing [0..1]
|
||||
25. w:mirrorIndents [0..1]
|
||||
26. w:suppressOverlap [0..1]
|
||||
27. w:jc [0..1] — justification (left/center/right/both)
|
||||
28. w:textDirection [0..1]
|
||||
29. w:textAlignment [0..1]
|
||||
30. w:outlineLvl [0..1] — outline level
|
||||
31. w:divId [0..1]
|
||||
32. w:rPr [0..1] — run properties for paragraph mark
|
||||
33. w:sectPr [0..1] — section break (section ends at this paragraph)
|
||||
34. w:pPrChange [0..1] — tracked paragraph property change
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:r (Run)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:rPr [0..1] — run properties (MUST be first)
|
||||
|
||||
Then any of (one per run, typically):
|
||||
- w:t [0..*] — text content
|
||||
- w:br [0..*] — break (line, page, column)
|
||||
- w:tab [0..*] — tab character
|
||||
- w:cr [0..*] — carriage return
|
||||
- w:sym [0..*] — symbol character
|
||||
- w:drawing [0..*] — DrawingML object (images)
|
||||
- w:pict [0..*] — VML picture (legacy)
|
||||
- w:fldChar [0..*] — complex field character
|
||||
- w:instrText [0..*] — field instruction text
|
||||
- w:delText [0..*] — deleted text (inside w:del)
|
||||
- w:footnoteReference [0..*]
|
||||
- w:endnoteReference [0..*]
|
||||
- w:commentReference [0..*]
|
||||
- w:lastRenderedPageBreak [0..*]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:rPr (Run Properties)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:rStyle [0..1] — character style reference
|
||||
2. w:rFonts [0..1] — font specification
|
||||
3. w:b [0..1] — bold
|
||||
4. w:bCs [0..1] — complex script bold
|
||||
5. w:i [0..1] — italic
|
||||
6. w:iCs [0..1] — complex script italic
|
||||
7. w:caps [0..1] — all capitals
|
||||
8. w:smallCaps [0..1] — small capitals
|
||||
9. w:strike [0..1] — strikethrough
|
||||
10. w:dstrike [0..1] — double strikethrough
|
||||
11. w:outline [0..1]
|
||||
12. w:shadow [0..1]
|
||||
13. w:emboss [0..1]
|
||||
14. w:imprint [0..1]
|
||||
15. w:noProof [0..1] — suppress proofing
|
||||
16. w:snapToGrid [0..1]
|
||||
17. w:vanish [0..1] — hidden text
|
||||
18. w:color [0..1] — text color
|
||||
19. w:spacing [0..1] — character spacing
|
||||
20. w:w [0..1] — character width scaling
|
||||
21. w:kern [0..1] — font kerning
|
||||
22. w:position [0..1] — vertical position (raise/lower)
|
||||
23. w:sz [0..1] — font size (half-points)
|
||||
24. w:szCs [0..1] — complex script font size
|
||||
25. w:highlight [0..1] — text highlight color
|
||||
26. w:u [0..1] — underline
|
||||
27. w:effect [0..1] — text effect (animated)
|
||||
28. w:bdr [0..1] — run border
|
||||
29. w:shd [0..1] — run shading
|
||||
30. w:vertAlign [0..1] — superscript/subscript
|
||||
31. w:rtl [0..1] — right-to-left
|
||||
32. w:cs [0..1] — complex script
|
||||
33. w:lang [0..1] — language
|
||||
34. w:rPrChange [0..1] — tracked run property change
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:tbl (Table)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:tblPr [1..1] — table properties (REQUIRED, must be first)
|
||||
2. w:tblGrid [1..1] — column width definitions (REQUIRED)
|
||||
3. w:tr [1..*] — table row(s)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:tblPr (Table Properties)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:tblStyle [0..1] — table style reference
|
||||
2. w:tblpPr [0..1] — table positioning
|
||||
3. w:tblOverlap [0..1]
|
||||
4. w:bidiVisual [0..1] — right-to-left table
|
||||
5. w:tblStyleRowBandSize [0..1]
|
||||
6. w:tblStyleColBandSize [0..1]
|
||||
7. w:tblW [0..1] — preferred table width
|
||||
8. w:jc [0..1] — table alignment
|
||||
9. w:tblCellSpacing [0..1]
|
||||
10. w:tblInd [0..1] — table indent from margin
|
||||
11. w:tblBorders [0..1] — table borders
|
||||
12. w:shd [0..1] — table shading
|
||||
13. w:tblLayout [0..1] — fixed or autofit
|
||||
14. w:tblCellMar [0..1] — default cell margins
|
||||
15. w:tblLook [0..1] — conditional formatting flags
|
||||
16. w:tblCaption [0..1] — accessibility caption
|
||||
17. w:tblDescription [0..1] — accessibility description
|
||||
18. w:tblPrChange [0..1] — tracked table property change
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:tr (Table Row)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:trPr [0..1] — row properties (must be first)
|
||||
2. w:tc [1..*] — table cell(s)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:trPr (Table Row Properties)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:cnfStyle [0..1] — conditional formatting
|
||||
2. w:divId [0..1]
|
||||
3. w:gridBefore [0..1] — grid columns before first cell
|
||||
4. w:gridAfter [0..1] — grid columns after last cell
|
||||
5. w:wBefore [0..1]
|
||||
6. w:wAfter [0..1]
|
||||
7. w:cantSplit [0..1] — don't split row across pages
|
||||
8. w:trHeight [0..1] — row height
|
||||
9. w:tblHeader [0..1] — repeat as header row
|
||||
10. w:tblCellSpacing [0..1]
|
||||
11. w:jc [0..1] — row alignment
|
||||
12. w:hidden [0..1]
|
||||
13. w:ins [0..1] — tracked row insertion
|
||||
14. w:del [0..1] — tracked row deletion
|
||||
15. w:trPrChange [0..1] — tracked row property change
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:tc (Table Cell)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:tcPr [0..1] — cell properties (must be first)
|
||||
2. w:p [1..*] — paragraph(s) — at least one required
|
||||
3. w:tbl [0..*] — nested table(s)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:tcPr (Table Cell Properties)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:cnfStyle [0..1]
|
||||
2. w:tcW [0..1] — cell width
|
||||
3. w:gridSpan [0..1] — horizontal merge (column span)
|
||||
4. w:hMerge [0..1] — legacy horizontal merge
|
||||
5. w:vMerge [0..1] — vertical merge
|
||||
6. w:tcBorders [0..1] — cell borders
|
||||
7. w:shd [0..1] — cell shading
|
||||
8. w:noWrap [0..1]
|
||||
9. w:tcMar [0..1] — cell margins
|
||||
10. w:textDirection [0..1]
|
||||
11. w:tcFitText [0..1]
|
||||
12. w:vAlign [0..1] — vertical alignment
|
||||
13. w:hideMark [0..1]
|
||||
14. w:tcPrChange [0..1] — tracked cell property change
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:sectPr (Section Properties)
|
||||
|
||||
```
|
||||
Children in order:
|
||||
1. w:headerReference [0..*] — header references (type: default/first/even)
|
||||
2. w:footerReference [0..*] — footer references
|
||||
3. w:footnotePr [0..1]
|
||||
4. w:endnotePr [0..1]
|
||||
5. w:type [0..1] — section break type (nextPage/continuous/evenPage/oddPage)
|
||||
6. w:pgSz [0..1] — page size
|
||||
7. w:pgMar [0..1] — page margins
|
||||
8. w:paperSrc [0..1]
|
||||
9. w:pgBorders [0..1] — page borders
|
||||
10. w:lnNumType [0..1] — line numbering
|
||||
11. w:pgNumType [0..1] — page numbering
|
||||
12. w:cols [0..1] — column definitions
|
||||
13. w:formProt [0..1]
|
||||
14. w:vAlign [0..1] — vertical alignment of page
|
||||
15. w:noEndnote [0..1]
|
||||
16. w:titlePg [0..1] — different first page header/footer
|
||||
17. w:textDirection [0..1]
|
||||
18. w:bidi [0..1]
|
||||
19. w:rtlGutter [0..1]
|
||||
20. w:docGrid [0..1] — document grid
|
||||
21. w:sectPrChange [0..1] — tracked section property change
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## w:hdr (Header) / w:ftr (Footer)
|
||||
|
||||
```
|
||||
Children (same structure as w:body content):
|
||||
1. w:p [0..*] — paragraph(s)
|
||||
2. w:tbl [0..*] — table(s)
|
||||
3. w:sdt [0..*] — content controls
|
||||
```
|
||||
|
||||
Headers and footers are essentially mini-documents. They follow the same content model as `w:body` but without a final `w:sectPr`.
|
||||
+4061
File diff suppressed because it is too large
Load Diff
+2820
File diff suppressed because it is too large
Load Diff
+3381
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,82 @@
|
||||
# OpenXML Namespaces, Relationship Types, and Content Types
|
||||
|
||||
## Core Namespaces
|
||||
|
||||
| Prefix | URI | Used In |
|
||||
|--------|-----|---------|
|
||||
| `w` | `http://schemas.openxmlformats.org/wordprocessingml/2006/main` | document.xml, styles.xml, numbering.xml, headers, footers |
|
||||
| `r` | `http://schemas.openxmlformats.org/officeDocument/2006/relationships` | Relationship references (r:id) |
|
||||
| `wp` | `http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing` | Image/drawing placement in document |
|
||||
| `a` | `http://schemas.openxmlformats.org/drawingml/2006/main` | DrawingML core (shapes, images, themes) |
|
||||
| `pic` | `http://schemas.openxmlformats.org/drawingml/2006/picture` | Picture element in DrawingML |
|
||||
| `v` | `urn:schemas-microsoft-com:vml` | VML (legacy shapes, watermarks) |
|
||||
| `o` | `urn:schemas-microsoft-com:office:office` | Office VML extensions |
|
||||
| `m` | `http://schemas.openxmlformats.org/officeDocument/2006/math` | Math equations (OMML) |
|
||||
| `mc` | `http://schemas.openxmlformats.org/markup-compatibility/2006` | Markup compatibility (Ignorable, AlternateContent) |
|
||||
|
||||
## Extended Namespaces
|
||||
|
||||
| Prefix | URI | Purpose |
|
||||
|--------|-----|---------|
|
||||
| `w14` | `http://schemas.microsoft.com/office/word/2010/wordml` | Word 2010 extensions (contentPart, etc.) |
|
||||
| `w15` | `http://schemas.microsoft.com/office/word/2012/wordml` | Word 2013 extensions (commentEx, etc.) |
|
||||
| `w16cid` | `http://schemas.microsoft.com/office/word/2016/wordml/cid` | Comment IDs (durable IDs) |
|
||||
| `w16cex` | `http://schemas.microsoft.com/office/word/2018/wordml/cex` | Comment extensible |
|
||||
| `w16se` | `http://schemas.microsoft.com/office/word/2015/wordml/symex` | Symbol extensions |
|
||||
| `wps` | `http://schemas.microsoft.com/office/word/2010/wordprocessingShape` | WordprocessingML shapes |
|
||||
| `wpc` | `http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas` | Drawing canvas |
|
||||
|
||||
## Relationship Types
|
||||
|
||||
| Relationship | Type URI |
|
||||
|-------------|----------|
|
||||
| Document | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument` |
|
||||
| Styles | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles` |
|
||||
| Numbering | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering` |
|
||||
| Font Table | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable` |
|
||||
| Settings | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings` |
|
||||
| Theme | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme` |
|
||||
| Image | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/image` |
|
||||
| Hyperlink | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink` |
|
||||
| Header | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/header` |
|
||||
| Footer | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer` |
|
||||
| Comments | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments` |
|
||||
| CommentsExtended | `http://schemas.microsoft.com/office/2011/relationships/commentsExtended` |
|
||||
| CommentsIds | `http://schemas.microsoft.com/office/2016/09/relationships/commentsIds` |
|
||||
| CommentsExtensible | `http://schemas.microsoft.com/office/2018/08/relationships/commentsExtensible` |
|
||||
| Footnotes | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes` |
|
||||
| Endnotes | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes` |
|
||||
| Glossary | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/glossaryDocument` |
|
||||
| Web Settings | `http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings` |
|
||||
|
||||
## Content Types (`[Content_Types].xml`)
|
||||
|
||||
### Default Extensions
|
||||
|
||||
```xml
|
||||
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml" />
|
||||
<Default Extension="xml" ContentType="application/xml" />
|
||||
<Default Extension="png" ContentType="image/png" />
|
||||
<Default Extension="jpeg" ContentType="image/jpeg" />
|
||||
<Default Extension="gif" ContentType="image/gif" />
|
||||
<Default Extension="emf" ContentType="image/x-emf" />
|
||||
```
|
||||
|
||||
### Part Overrides
|
||||
|
||||
| Part | Content Type |
|
||||
|------|-------------|
|
||||
| `/word/document.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml` |
|
||||
| `/word/styles.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml` |
|
||||
| `/word/numbering.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml` |
|
||||
| `/word/settings.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml` |
|
||||
| `/word/fontTable.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml` |
|
||||
| `/word/theme/theme1.xml` | `application/vnd.openxmlformats-officedocument.theme+xml` |
|
||||
| `/word/header1.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml` |
|
||||
| `/word/footer1.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml` |
|
||||
| `/word/comments.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml` |
|
||||
| `/word/commentsExtended.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml` |
|
||||
| `/word/commentsIds.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.commentsIds+xml` |
|
||||
| `/word/commentsExtensible.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtensible+xml` |
|
||||
| `/word/footnotes.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml` |
|
||||
| `/word/endnotes.xml` | `application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml` |
|
||||
@@ -0,0 +1,72 @@
|
||||
# OpenXML Unit Conversion Quick Reference
|
||||
|
||||
## Master Conversion Table
|
||||
|
||||
| Unit | 1 inch | 1 cm | 1 mm | 1 pt | Description |
|
||||
|------|--------|------|------|------|-------------|
|
||||
| DXA (twips) | 1440 | 567 | 56.7 | 20 | 1/20 of a point. Used for margins, indents, spacing, page size. |
|
||||
| EMU | 914400 | 360000 | 36000 | 12700 | English Metric Unit. Used for images, drawings, shapes. |
|
||||
| Half-points | 144 | 56.7 | 5.67 | 2 | Used for font sizes (`w:sz`, `w:szCs`). |
|
||||
| Points | 72 | 28.35 | 2.835 | 1 | Standard typographic unit. Not used directly in most attributes. |
|
||||
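| Eighths of a point | 576 | 226.8 | 22.68 | 8 | Used for border widths (`w:sz` on border elements in `w:pBdr`, `w:tblBorders`, `w:tcBorders`). Character spacing (`w:spacing` in `w:rPr`) uses DXA, not eighths. |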
|
||||
|
||||
## Common Page Sizes
|
||||
|
||||
| Size | Width (DXA) | Height (DXA) | Width (mm) | Height (mm) |
|
||||
|------|-------------|--------------|------------|-------------|
|
||||
| A4 | 11906 | 16838 | 210 | 297 |
|
||||
| Letter | 12240 | 15840 | 215.9 | 279.4 |
|
||||
| Legal | 12240 | 20160 | 215.9 | 355.6 |
|
||||
| A3 | 16838 | 23811 | 297 | 420 |
|
||||
| A5 | 8391 | 11906 | 148 | 210 |
|
||||
|
||||
## Common Margin Values
|
||||
|
||||
| Margin | DXA | Inches | cm |
|
||||
|--------|-----|--------|----|
|
||||
| 0.5 inch | 720 | 0.5 | 1.27 |
|
||||
| 0.75 inch | 1080 | 0.75 | 1.91 |
|
||||
| 1 inch | 1440 | 1.0 | 2.54 |
|
||||
| 1.25 inch | 1800 | 1.25 | 3.18 |
|
||||
| 1.5 inch | 2160 | 1.5 | 3.81 |
|
||||
|
||||
## Font Size Values (`w:sz`)
|
||||
|
||||
| Display Size | w:sz value | Notes |
|
||||
|-------------|-----------|-------|
|
||||
| 8pt | 16 | |
|
||||
| 9pt | 18 | |
|
||||
| 10pt | 20 | |
|
||||
| 10.5pt | 21 | Common CJK body size |
|
||||
| 11pt | 22 | Default Calibri body |
|
||||
| 12pt | 24 | Default TNR body |
|
||||
| 14pt | 28 | Small heading |
|
||||
| 16pt | 32 | |
|
||||
| 18pt | 36 | |
|
||||
| 20pt | 40 | |
|
||||
| 24pt | 48 | |
|
||||
| 28pt | 56 | |
|
||||
| 36pt | 72 | |
|
||||
|
||||
## Line Spacing Values
|
||||
|
||||
Line spacing in `w:spacing` uses the `w:line` attribute in 240ths of a line (when `w:lineRule="auto"`):
|
||||
|
||||
| Spacing | w:line value | w:lineRule |
|
||||
|---------|-------------|-----------|
|
||||
| Single | 240 | auto |
|
||||
| 1.15 (Word default) | 276 | auto |
|
||||
| 1.5 | 360 | auto |
|
||||
| Double | 480 | auto |
|
||||
| Exact 12pt | 240 | exact |
|
||||
| At least 12pt | 240 | atLeast |
|
||||
|
||||
Note: When `lineRule="exact"` or `"atLeast"`, `w:line` is in **twips** (DXA), not 240ths. So `line="240"` with `lineRule="exact"` means exactly 12pt (240/20 = 12pt).
|
||||
|
||||
## Conversion Formulas
|
||||
|
||||
```
|
||||
DXA = inches × 1440 = cm × 567 = pt × 20
|
||||
EMU = inches × 914400 = cm × 360000 = pt × 12700
|
||||
sz = pt × 2 (half-points)
|
||||
```
|
||||
@@ -0,0 +1,284 @@
|
||||
# Scenario A: Creating a New DOCX from Scratch
|
||||
|
||||
## When to Use
|
||||
|
||||
Use Scenario A when:
|
||||
- The user has no existing file and wants a brand new document
|
||||
- The user provides content (text, tables, images) and wants it assembled into a DOCX
|
||||
- The user specifies a document type (report, letter, memo, academic) or describes a custom layout
|
||||
|
||||
Do NOT use when: the user already has a DOCX they want to modify (→ Scenario B) or wants to restyle an existing document (→ Scenario C).
|
||||
|
||||
---
|
||||
|
||||
## Step-by-Step Workflow
|
||||
|
||||
### 1. Determine Document Type
|
||||
|
||||
Ask or infer the document type from the user's request:
|
||||
|
||||
| Type | Typical Signals |
|
||||
|------|----------------|
|
||||
| Report | "report", "analysis", "whitepaper", sections with headings |
|
||||
| Letter | "letter", "dear", address block, salutation |
|
||||
| Memo | "memo", "memorandum", To/From/Subject fields |
|
||||
| Academic | "paper", "essay", "thesis", APA/MLA/Chicago mention |
|
||||
| Custom | None of the above, or user specifies exact formatting |
|
||||
|
||||
### 2. Gather Content Requirements
|
||||
|
||||
Collect from the user:
|
||||
- Title and subtitle (if any)
|
||||
- Author / organization
|
||||
- Section structure (headings and nesting)
|
||||
- Body content per section
|
||||
- Tables (headers + rows)
|
||||
- Images (file paths or placeholders)
|
||||
- Special elements: TOC, page numbers, watermark, headers/footers
|
||||
|
||||
### 3. Select Style Set
|
||||
|
||||
Based on document type, load the matching styles XML asset:
|
||||
- Report → `assets/styles/default_styles.xml` or `assets/styles/corporate_styles.xml`
|
||||
- Academic → `assets/styles/academic_styles.xml`
|
||||
- Letter / Memo / Custom → `assets/styles/default_styles.xml` (with overrides)
|
||||
|
||||
### 4. Configure Page Setup
|
||||
|
||||
Set `w:sectPr` values based on document type defaults (see below) or user overrides.
|
||||
|
||||
```xml
|
||||
<w:sectPr>
|
||||
<w:pgSz w:w="11906" w:h="16838" /> <!-- A4 -->
|
||||
<w:pgMar w:top="1440" w:right="1440" w:bottom="1440" w:left="1440"
|
||||
w:header="720" w:footer="720" w:gutter="0" />
|
||||
</w:sectPr>
|
||||
```
|
||||
|
||||
### 5. Build Document Structure
|
||||
|
||||
Assemble `word/document.xml` with:
|
||||
1. `w:document` as the root element, with `w:body` as its content container
|
||||
2. Paragraphs (`w:p`) with heading styles for section titles
|
||||
3. Body paragraphs with `Normal` style
|
||||
4. Tables, images, and other elements as needed
|
||||
5. Final `w:sectPr` as last child of `w:body`
|
||||
|
||||
### 6. Apply Typography Defaults
|
||||
|
||||
Set document-level defaults in `styles.xml` under `w:docDefaults`:
|
||||
```xml
|
||||
<w:docDefaults>
|
||||
<w:rPrDefault>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:eastAsia="SimSun" w:cs="Arial" />
|
||||
<w:sz w:val="22" /> <!-- 11pt -->
|
||||
<w:szCs w:val="22" />
|
||||
</w:rPr>
|
||||
</w:rPrDefault>
|
||||
<w:pPrDefault>
|
||||
<w:pPr>
|
||||
<w:spacing w:after="160" w:line="259" w:lineRule="auto" />
|
||||
</w:pPr>
|
||||
</w:pPrDefault>
|
||||
</w:docDefaults>
|
||||
```
|
||||
|
||||
### 7. Add Complex Elements
|
||||
|
||||
See the Complex Elements Guide section below.
|
||||
|
||||
### 8. Run Validation Pipeline
|
||||
|
||||
```
|
||||
dotnet run ... validate --xsd wml-subset.xsd
|
||||
dotnet run ... validate --xsd business-rules.xsd # if applying a template
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Document Type Defaults
|
||||
|
||||
### Report
|
||||
| Property | Value |
|
||||
|----------|-------|
|
||||
| Body font | Calibri 11pt |
|
||||
| Heading font | Calibri Light |
|
||||
| H1 / H2 / H3 / H4 size | 28pt / 24pt / 18pt / 14pt |
|
||||
| Heading color | #2F5496 (corporate blue) |
|
||||
| Margins | 1 inch (1440 DXA) all sides |
|
||||
| Page size | A4 (11906 × 16838 DXA) |
|
||||
| Line spacing | Single (line="240") |
|
||||
| Paragraph spacing | 0pt before, 8pt after body |
|
||||
|
||||
### Letter
|
||||
| Property | Value |
|
||||
|----------|-------|
|
||||
| Font | Calibri 11pt |
|
||||
| Page size | Letter (12240 × 15840 DXA) |
|
||||
| Margins | 1 inch all sides |
|
||||
| Structure | Date → Address → Salutation → Body → Closing → Signature |
|
||||
| Line spacing | Single |
|
||||
|
||||
### Memo
|
||||
| Property | Value |
|
||||
|----------|-------|
|
||||
| Font | Arial 11pt |
|
||||
| Page size | Letter |
|
||||
| Margins | 0.75 inch (1080 DXA) |
|
||||
| Header | "MEMO" centered, bold, 16pt |
|
||||
| Fields | To, From, Date, Subject (bold labels, tab-aligned values) |
|
||||
|
||||
### Academic
|
||||
| Property | Value |
|
||||
|----------|-------|
|
||||
| Font | Times New Roman 12pt |
|
||||
| Line spacing | Double (line="480") |
|
||||
| Margins | 1 inch all sides |
|
||||
| Page size | Letter |
|
||||
| Headings | Bold, same font, 14/13/12pt for H1/H2/H3 |
|
||||
| First line indent | 0.5 inch (720 DXA) |
|
||||
| Heading color | Black (no color) |
|
||||
|
||||
---
|
||||
|
||||
## Content Configuration JSON Format
|
||||
|
||||
The CLI `create` command accepts a JSON config:
|
||||
|
||||
```json
|
||||
{
|
||||
"type": "report",
|
||||
"title": "Quarterly Revenue Analysis",
|
||||
"subtitle": "Q1 2026",
|
||||
"author": "Finance Team",
|
||||
"pageSize": "A4",
|
||||
"margins": { "top": 1440, "right": 1440, "bottom": 1440, "left": 1440 },
|
||||
"sections": [
|
||||
{
|
||||
"heading": "Executive Summary",
|
||||
"level": 1,
|
||||
"content": [
|
||||
{ "type": "paragraph", "text": "Revenue grew 12% year-over-year..." },
|
||||
{
|
||||
"type": "table",
|
||||
"headers": ["Region", "Revenue", "Growth"],
|
||||
"rows": [
|
||||
["North America", "$4.2M", "+15%"],
|
||||
["Europe", "$2.8M", "+8%"],
|
||||
["Asia Pacific", "$1.9M", "+18%"]
|
||||
]
|
||||
},
|
||||
{ "type": "image", "path": "charts/revenue.png", "width": "5in", "alt": "Revenue chart" }
|
||||
]
|
||||
},
|
||||
{
|
||||
"heading": "Detailed Analysis",
|
||||
"level": 1,
|
||||
"content": [
|
||||
{ "type": "paragraph", "text": "Breaking down by product line..." }
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Supported content types:
|
||||
- `paragraph` — body text (applies Normal style)
|
||||
- `table` — headers + rows (applies TableGrid style)
|
||||
- `image` — inline image with width/height control
|
||||
- `list` — bulleted or numbered list items
|
||||
- `pageBreak` — forces a page break
|
||||
|
||||
---
|
||||
|
||||
## Complex Elements Guide
|
||||
|
||||
### Table of Contents
|
||||
|
||||
Insert a TOC field code. Word will update the actual entries when the file is opened:
|
||||
|
||||
```xml
|
||||
<w:p>
|
||||
<w:pPr><w:pStyle w:val="TOCHeading" /></w:pPr>
|
||||
<w:r><w:t>Table of Contents</w:t></w:r>
|
||||
</w:p>
|
||||
<w:p>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="begin" />
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:instrText xml:space="preserve"> TOC \o "1-3" \h \z \u </w:instrText>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="separate" />
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:t>[Table of contents — update to populate]</w:t>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="end" />
|
||||
</w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
### Page Numbers in Footer
|
||||
|
||||
Add a footer part (`word/footer1.xml`) and reference it in `w:sectPr`:
|
||||
|
||||
```xml
|
||||
<!-- In footer1.xml -->
|
||||
<w:ftr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
|
||||
<w:p>
|
||||
<w:pPr><w:jc w:val="center" /></w:pPr>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="begin" />
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:instrText>PAGE</w:instrText>
|
||||
</w:r>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="separate" />
|
||||
</w:r>
|
||||
<w:r><w:t>1</w:t></w:r>
|
||||
<w:r>
|
||||
<w:fldChar w:fldCharType="end" />
|
||||
</w:r>
|
||||
</w:p>
|
||||
</w:ftr>
|
||||
|
||||
<!-- In sectPr -->
|
||||
<w:footerReference w:type="default" r:id="rId8" />
|
||||
```
|
||||
|
||||
### Watermark
|
||||
|
||||
Add a header part with a shape behind the text:
|
||||
|
||||
```xml
|
||||
<w:hdr>
|
||||
<w:p>
|
||||
<w:r>
|
||||
<w:pict>
|
||||
<v:shape style="position:absolute;margin-left:0;margin-top:0;width:468pt;height:180pt;
|
||||
z-index:-251657216;mso-position-horizontal:center;
|
||||
mso-position-vertical:center"
|
||||
fillcolor="silver" stroked="f">
|
||||
<v:textpath style="font-family:'Calibri';font-size:1pt" string="DRAFT" />
|
||||
</v:shape>
|
||||
</w:pict>
|
||||
</w:r>
|
||||
</w:p>
|
||||
</w:hdr>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Post-Creation Checklist
|
||||
|
||||
1. **Validate** against `wml-subset.xsd` — all elements in correct order, required attributes present
|
||||
2. **Merge adjacent runs** with identical formatting to keep XML clean
|
||||
3. **Verify relationships** — every `r:id` in document.xml has a matching entry in `document.xml.rels`
|
||||
4. **Check content types** — every part in the package is registered in `[Content_Types].xml`
|
||||
5. **Preview** — open in Word or LibreOffice to visually confirm layout
|
||||
6. **File size** — confirm images are reasonably sized (compress if > 2MB each)
|
||||
@@ -0,0 +1,295 @@
|
||||
# Scenario B: Editing / Filling Content in Existing DOCX
|
||||
|
||||
## Core Principle
|
||||
|
||||
**"First, do no harm."** When editing an existing document, minimize changes. Touch only what needs to change. Preserve all formatting, styles, relationships, and structure that are not directly involved in the edit.
|
||||
|
||||
---
|
||||
|
||||
## When to Use
|
||||
|
||||
- Replacing placeholder text (`{{name}}`, `$DATE$`, `[PLACEHOLDER]`)
|
||||
- Updating specific paragraphs or table cells
|
||||
- Filling in form fields
|
||||
- Adding or removing paragraphs in a known location
|
||||
- Inserting tracked changes for review workflows
|
||||
|
||||
Do NOT use when: the user wants to change the look/style of the entire document (→ Scenario C) or create from scratch (→ Scenario A).
|
||||
|
||||
---
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
1. Preview → CLI: analyze <input.docx>
|
||||
2. Analyze → Understand structure: sections, styles, headings, tables
|
||||
3. Identify → Locate exact edit targets (paragraph index, table index, placeholder text)
|
||||
4. Edit → Apply surgical changes via CLI or direct XML
|
||||
5. Validate → CLI: validate <output.docx>
|
||||
6. Diff → Compare before/after to verify only intended changes were made
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## When to Use the CLI vs Direct XML
|
||||
|
||||
### Use CLI Edit Command When:
|
||||
- Replacing placeholder text (e.g., `{{fieldName}}` → actual value)
|
||||
- Filling table data from JSON
|
||||
- Updating document properties (title, author)
|
||||
- Simple text insertions or deletions
|
||||
|
||||
### Use Direct XML Manipulation When:
|
||||
- Text spans multiple runs with different formatting (run-boundary issues)
|
||||
- Adding complex structures (nested tables, multi-image layouts)
|
||||
- Manipulating Track Changes markup
|
||||
- Modifying header/footer content
|
||||
- Adjusting section properties
|
||||
|
||||
---
|
||||
|
||||
## Placeholder Patterns
|
||||
|
||||
The CLI natively supports `{{fieldName}}` placeholders:
|
||||
|
||||
```bash
|
||||
# Replace all {{placeholders}} from a JSON map
|
||||
dotnet run ... edit input.docx --fill-placeholders data.json --output filled.docx
|
||||
```
|
||||
|
||||
Where `data.json`:
|
||||
```json
|
||||
{
|
||||
"companyName": "Acme Corp",
|
||||
"date": "March 21, 2026",
|
||||
"amount": "$15,000.00",
|
||||
"recipientName": "Jane Smith"
|
||||
}
|
||||
```
|
||||
|
||||
Other placeholder formats (`$FIELD$`, `[PLACEHOLDER]`) require text replacement:
|
||||
```bash
|
||||
dotnet run ... edit input.docx --replace '$DATE$' "March 21, 2026" --output updated.docx
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Text Replacement Strategies
|
||||
|
||||
### Simple Replacement
|
||||
|
||||
When the entire search text is within a single `w:r` (run):
|
||||
|
||||
```xml
|
||||
<!-- Before -->
|
||||
<w:r>
|
||||
<w:rPr><w:b /></w:rPr>
|
||||
<w:t>{{companyName}}</w:t>
|
||||
</w:r>
|
||||
|
||||
<!-- After — formatting preserved -->
|
||||
<w:r>
|
||||
<w:rPr><w:b /></w:rPr>
|
||||
<w:t>Acme Corp</w:t>
|
||||
</w:r>
|
||||
```
|
||||
|
||||
Direct replacement. The run's `w:rPr` is untouched.
|
||||
|
||||
### Complex Replacement (Split Runs)
|
||||
|
||||
When the search text is split across multiple runs (common when Word applies spell-check or formatting mid-text):
|
||||
|
||||
```xml
|
||||
<!-- "{{companyName}}" split into 3 runs -->
|
||||
<w:r><w:rPr><w:b /></w:rPr><w:t>{{company</w:t></w:r>
|
||||
<w:r><w:rPr><w:b /><w:i /></w:rPr><w:t>Na</w:t></w:r>
|
||||
<w:r><w:rPr><w:b /></w:rPr><w:t>me}}</w:t></w:r>
|
||||
```
|
||||
|
||||
Strategy:
|
||||
1. Concatenate text across runs to find the match
|
||||
2. Place the replacement text in the **first** run (preserving its `w:rPr`)
|
||||
3. Remove the text from subsequent runs (or remove the runs entirely if empty)
|
||||
|
||||
```xml
|
||||
<!-- After -->
|
||||
<w:r><w:rPr><w:b /></w:rPr><w:t>Acme Corp</w:t></w:r>
|
||||
```
|
||||
|
||||
**Rule**: Always preserve the formatting of the first run in the match.
|
||||
|
||||
---
|
||||
|
||||
## Table Editing
|
||||
|
||||
### By Index
|
||||
|
||||
Tables are 0-indexed in document order:
|
||||
|
||||
```bash
|
||||
dotnet run ... edit input.docx --table-index 0 --table-data data.json --output updated.docx
|
||||
```
|
||||
|
||||
### By Header Matching
|
||||
|
||||
Find a table by its header row content:
|
||||
|
||||
```bash
|
||||
dotnet run ... edit input.docx --table-match "Name,Amount,Date" --table-data data.json
|
||||
```
|
||||
|
||||
### Table Data JSON Format
|
||||
|
||||
```json
|
||||
{
|
||||
"rows": [
|
||||
["Alice Johnson", "$5,000", "2026-03-15"],
|
||||
["Bob Smith", "$3,200", "2026-03-18"]
|
||||
],
|
||||
"appendRows": true
|
||||
}
|
||||
```
|
||||
|
||||
- `appendRows: true` — add rows after existing data
|
||||
- `appendRows: false` (default) — replace all data rows (keeps header row)
|
||||
|
||||
### Direct XML Table Editing
|
||||
|
||||
To modify a specific cell, locate it by row/column index:
|
||||
|
||||
```xml
|
||||
<!-- Row 2 (0-indexed), Column 1 -->
|
||||
<w:tr> <!-- tr[2] -->
|
||||
<w:tc>...</w:tc>
|
||||
<w:tc> <!-- tc[1] — target cell -->
|
||||
<w:p>
|
||||
<w:r><w:t>Old Value</w:t></w:r>
|
||||
</w:p>
|
||||
</w:tc>
|
||||
</w:tr>
|
||||
```
|
||||
|
||||
Replace the `w:t` content. Do NOT modify `w:tcPr` (cell properties) or `w:tblPr` (table properties).
|
||||
|
||||
---
|
||||
|
||||
## Track Changes Guidance
|
||||
|
||||
### When to Add Revision Marks
|
||||
- User explicitly requests tracked changes
|
||||
- Document already has tracking enabled (`w:trackChanges` in settings)
|
||||
- Collaborative review workflow
|
||||
|
||||
### When NOT to Add Revision Marks
|
||||
- Form filling / placeholder replacement (these are "completing" the document, not "revising" it)
|
||||
- Direct edits where the user wants a clean result
|
||||
- Batch data filling operations
|
||||
|
||||
### Adding Tracked Changes
|
||||
|
||||
See `references/track_changes_guide.md` for full XML examples.
|
||||
|
||||
Quick reference — inserting text with tracking:
|
||||
```xml
|
||||
<w:ins w:id="1" w:author="MiniMaxAI" w:date="2026-03-21T10:00:00Z">
|
||||
<w:r>
|
||||
<w:t>New text here</w:t>
|
||||
</w:r>
|
||||
</w:ins>
|
||||
```
|
||||
|
||||
Deleting text with tracking:
|
||||
```xml
|
||||
<w:del w:id="2" w:author="MiniMaxAI" w:date="2026-03-21T10:00:00Z">
|
||||
<w:r>
|
||||
<w:delText>Removed text</w:delText> <!-- MUST use delText, not t -->
|
||||
</w:r>
|
||||
</w:del>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
### 1. Breaking Run Boundaries
|
||||
|
||||
**Problem**: Replacing text that spans runs by naively modifying individual runs destroys inline formatting.
|
||||
|
||||
**Fix**: Concatenate run text, find match boundaries, consolidate into the first run, remove consumed runs.
|
||||
|
||||
### 2. Hyperlink Content
|
||||
|
||||
**Problem**: Replacing text inside a `w:hyperlink` element without preserving the hyperlink wrapper removes the link.
|
||||
|
||||
```xml
|
||||
<w:hyperlink r:id="rId5">
|
||||
<w:r>
|
||||
<w:rPr><w:rStyle w:val="Hyperlink" /></w:rPr>
|
||||
<w:t>Click here</w:t> <!-- Only replace this text -->
|
||||
</w:r>
|
||||
</w:hyperlink>
|
||||
```
|
||||
|
||||
**Fix**: Only modify the `w:t` inside the hyperlink's run. Never remove or replace the `w:hyperlink` element itself.
|
||||
|
||||
### 3. Tracked Change Context
|
||||
|
||||
**Problem**: Replacing text that is inside a `w:ins` or `w:del` element without understanding the revision context creates invalid markup.
|
||||
|
||||
**Fix**: If the target text is inside a revision mark, either:
|
||||
- Replace within the revision context (preserving the `w:ins`/`w:del` wrapper)
|
||||
- Or delete the old revision and create a new one
|
||||
|
||||
### 4. Style Preservation
|
||||
|
||||
**Problem**: Adding new paragraphs without specifying a style causes them to inherit `Normal`, which may not match the surrounding context.
|
||||
|
||||
**Fix**: When inserting paragraphs, copy the `w:pStyle` from an adjacent paragraph of the same type.
|
||||
|
||||
### 5. Numbering Continuity
|
||||
|
||||
**Problem**: Inserting a new list item breaks numbering sequence.
|
||||
|
||||
**Fix**: Ensure the new paragraph has the same `w:numId` and `w:ilvl` as adjacent list items. If continuing a sequence, set `w:numPr` to match.
|
||||
|
||||
### 6. XML Special Characters
|
||||
|
||||
**Problem**: User content contains `&`, `<`, `>`, `"`, `'` — these must be escaped in XML.
|
||||
|
||||
**Fix**: Always XML-escape user-provided text before inserting into `w:t` elements:
|
||||
- `&` → `&amp;`
|
||||
- `<` → `&lt;`
|
||||
- `>` → `&gt;`
|
||||
- `"` → `&quot;`
|
||||
- `'` → `&apos;`
|
||||
|
||||
### 7. Whitespace Preservation
|
||||
|
||||
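**Problem**: Leading/trailing spaces in `w:t` are dropped by Word (and by whitespace-normalizing XML tooling) unless the element is explicitly marked as whitespace-preserving.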
|
||||
|
||||
**Fix**: Add `xml:space="preserve"` attribute:
|
||||
```xml
|
||||
<w:t xml:space="preserve"> text with leading space</w:t>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Diff Verification
|
||||
|
||||
After editing, always compare the before and after states:
|
||||
|
||||
```bash
|
||||
# Structural diff — shows only changed elements
|
||||
dotnet run ... diff original.docx modified.docx
|
||||
|
||||
# Text-only diff — shows content changes
|
||||
dotnet run ... diff original.docx modified.docx --text-only
|
||||
```
|
||||
|
||||
Verify:
|
||||
- Only intended text changed
|
||||
- No styles were modified
|
||||
- No relationships were added/removed unexpectedly
|
||||
- Table structure intact (same number of rows/columns unless intentionally changed)
|
||||
- Images and other media unchanged
|
||||
@@ -0,0 +1,456 @@
|
||||
# Scenario C: Applying Formatting / Templates
|
||||
|
||||
## When to Use
|
||||
|
||||
Use Scenario C when:
|
||||
- The user has an existing document and wants to apply a different visual style
|
||||
- The user wants to rebrand a document (new fonts, colors, heading styles)
|
||||
- The user provides a template DOCX and wants its look applied to a content document
|
||||
- The user wants consistent formatting across multiple documents
|
||||
|
||||
Do NOT use when: the user wants to edit content (→ Scenario B) or create from scratch (→ Scenario A).
|
||||
|
||||
---
|
||||
|
||||
## Workflow
|
||||
|
||||
```
|
||||
1. Analyze source → CLI: analyze source.docx (list styles, fonts, structure)
|
||||
2. Analyze template → CLI: analyze template.docx (list styles, fonts, structure)
|
||||
3. Map styles → Create mapping plan (source style → template style)
|
||||
4. Apply template → CLI: apply-template source.docx --template template.docx --output result.docx
|
||||
5. Validate (XSD) → CLI: validate result.docx --xsd wml-subset.xsd
|
||||
6. GATE-CHECK → CLI: validate result.docx --xsd business-rules.xsd ← MUST PASS
|
||||
7. Diff verify → CLI: diff source.docx result.docx --text-only (content must be identical)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What Gets Copied from Template
|
||||
|
||||
| Part | File | Description |
|
||||
|------|------|-------------|
|
||||
| Styles | `word/styles.xml` | All style definitions (paragraph, character, table, numbering) |
|
||||
| Theme | `word/theme/theme1.xml` | Color scheme, font scheme, format scheme |
|
||||
| Numbering | `word/numbering.xml` | List and numbering definitions |
|
||||
| Headers | `word/header*.xml` | Header content and formatting |
|
||||
| Footers | `word/footer*.xml` | Footer content and formatting |
|
||||
| Section props | `w:sectPr` | Margins, page size, orientation, columns |
|
||||
|
||||
## What Does NOT Get Copied
|
||||
|
||||
| Part | Reason |
|
||||
|------|--------|
|
||||
| Document content | Paragraphs, tables, images stay from source |
|
||||
| Comments | Belong to source document's review history |
|
||||
| Tracked changes | Belong to source document's revision history |
|
||||
| Custom XML parts | Application-specific data, not visual |
|
||||
| Document properties | Title, author, dates belong to source |
|
||||
| Glossary document | Template's building blocks are not transferred |
|
||||
|
||||
---
|
||||
|
||||
## Template Structure Analysis (REQUIRED)
|
||||
|
||||
Before choosing Overlay or Base-Replace, you MUST analyze the template's internal structure. This is the #1 cause of failure when skipped.
|
||||
|
||||
### Step 1: Count template paragraphs and identify structural zones
|
||||
|
||||
Run `$CLI analyze --input template.docx` or manually inspect:
|
||||
|
||||
```bash
|
||||
# Quick structure scan
|
||||
scripts/docx_preview.sh template.docx
|
||||
```
|
||||
|
||||
Identify these zones in the template:
|
||||
```
|
||||
Zone A: Front matter (cover page, declaration, abstract, TOC)
|
||||
→ These are KEPT from template, never replaced
|
||||
Zone B: Example/placeholder body content ("第1章 XXX", sample paragraphs)
|
||||
→ This is REPLACED with user's actual content
|
||||
Zone C: Back matter (appendices, acknowledgments, blank pages)
|
||||
→ These are KEPT from template or removed
|
||||
Zone D: Final sectPr
|
||||
→ ALWAYS kept from template
|
||||
```
|
||||
|
||||
### Step 2: Find Zone B boundaries (replacement range)
|
||||
|
||||
Search the template's document.xml for anchor text that marks the start and end of example content:
|
||||
|
||||
**Start anchor patterns** (first paragraph of example body):
|
||||
- "第1章", "第一章", "Chapter 1", "1 Introduction", "绪论"
|
||||
- The first paragraph with a Heading1-equivalent style after TOC
|
||||
|
||||
**End anchor patterns** (last paragraph before back matter):
|
||||
- "参考文献", "References", "致谢", "Acknowledgments"
|
||||
- The last paragraph before appendices or final sectPr
|
||||
|
||||
```python
|
||||
# Pseudocode for finding replacement range
|
||||
for i, element in enumerate(template_body_elements):
|
||||
text = get_text(element)
|
||||
style = get_style(element)
|
||||
if style in heading1_styles and ("第1章" in text or "Chapter 1" in text):
|
||||
replace_start = i
|
||||
if "参考文献" in text or "References" in text:
|
||||
replace_end = i
|
||||
break
|
||||
```
|
||||
|
||||
**CRITICAL**: Verify the range by printing what's inside:
|
||||
```
|
||||
Template elements [0..replace_start-1]: front matter (KEEP)
|
||||
Template elements [replace_start..replace_end]: example content (REPLACE)
|
||||
Template elements [replace_end+1..end]: back matter (KEEP)
|
||||
```
|
||||
|
||||
If replace_start or replace_end cannot be found, DO NOT proceed. Ask the user to identify the replacement boundaries.
|
||||
|
||||
### Step 3: Decide Overlay vs Base-Replace
|
||||
|
||||
Now that you know the structure:
|
||||
|
||||
| Observation | Decision |
|
||||
|-------------|----------|
|
||||
| Template has ≤30 paragraphs, no cover/TOC | **C-1: Overlay** (pure style template) |
|
||||
| Template has >100 paragraphs with cover/TOC/example sections | **C-2: Base-Replace** |
|
||||
| Template paragraph count ≈ user document | **C-1: Overlay** (similar structure) |
|
||||
| Template paragraph count >> user document (e.g., 263 vs 134) | **C-2: Base-Replace** |
|
||||
|
||||
### Step 4: For Base-Replace, execute the replacement
|
||||
|
||||
1. Load template as base (all files)
|
||||
2. Extract user content elements using `list(body)` — NOT `findall('w:p')` (which misses tables)
|
||||
3. Build new body: `template[0:replace_start] + cleaned_user_content + template[replace_end+1:]`
|
||||
4. Apply style mapping to every paragraph
|
||||
5. Clean direct formatting (see rules below)
|
||||
6. Rebuild document.xml, keeping template's namespace declarations
|
||||
7. Merge relationships (images + hyperlinks)
|
||||
8. Write output using template as ZIP base
|
||||
|
||||
---
|
||||
|
||||
## Style Mapping Strategy
|
||||
|
||||
When template style names differ from source style names, a mapping is required. **This step is mandatory** — skipping it is the #1 cause of formatting failures in template application.
|
||||
|
||||
### Step 0: Extract StyleIds from Both Documents (REQUIRED)
|
||||
|
||||
Before any template application, extract and compare styleIds from both documents:
|
||||
|
||||
```bash
|
||||
# Extract all styleIds from source
|
||||
$CLI analyze --input source.docx --styles-only
|
||||
# Output example:
|
||||
# Heading1 (paragraph, basedOn: Normal)
|
||||
# Heading2 (paragraph, basedOn: Normal)
|
||||
# Normal (paragraph)
|
||||
# ListBullet (paragraph, basedOn: Normal)
|
||||
|
||||
# Extract all styleIds from template
|
||||
$CLI analyze --input template.docx --styles-only
|
||||
# Output example:
|
||||
# 1 (paragraph, basedOn: a, name: "heading 1")
|
||||
# 2 (paragraph, basedOn: a, name: "heading 2")
|
||||
# 3 (paragraph, basedOn: a, name: "heading 3")
|
||||
# a (paragraph, name: "Normal")
|
||||
# a0 (character, name: "Default Paragraph Font")
|
||||
```
|
||||
|
||||
**Critical distinction**: `w:styleId` vs `w:name`:
|
||||
```xml
|
||||
<!-- styleId="1" but name="heading 1" -->
|
||||
<w:style w:type="paragraph" w:styleId="1">
|
||||
<w:name w:val="heading 1"/>
|
||||
<w:basedOn w:val="a"/>
|
||||
</w:style>
|
||||
```
|
||||
|
||||
The `w:styleId` attribute is what `<w:pStyle w:val="..."/>` references. The `w:name` child element (its `w:val`) is the human-readable display name. **They can be completely different.** Many CJK templates use numeric or short alphanumeric styleIds (`1`, `2`, `3`, `a`, `a0`) instead of English names.
|
||||
|
||||
### Tier 1: Exact StyleId Match
|
||||
If source uses `Heading1` and template defines `Heading1` as a styleId, map directly. No action needed.
|
||||
|
||||
### Tier 2: Name-Based Match
|
||||
If no exact styleId match, try matching by the `w:name` value (`<w:name w:val="..."/>`):
|
||||
- Source `Heading1` (name="heading 1") → Template styleId `1` (name="heading 1")
|
||||
- Match is case-insensitive on the name value
|
||||
|
||||
Within the same type, also try matching by:
|
||||
- Built-in style ID (Word's internal ID, e.g., heading 1 = built-in ID 1)
|
||||
- Style type (paragraph → paragraph, character → character, table → table)
|
||||
|
||||
### Tier 3: Manual Mapping
|
||||
For renamed or custom styles, provide an explicit mapping:
|
||||
|
||||
```json
|
||||
{
|
||||
"styleMap": {
|
||||
"Heading1": "1",
|
||||
"Heading2": "2",
|
||||
"Heading3": "3",
|
||||
"Heading4": "3",
|
||||
"Normal": "a",
|
||||
"BodyText": "a",
|
||||
"ListBullet": "a",
|
||||
"CompanyName": "Title",
|
||||
"OldTableStyle": "TableGrid"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Common Non-Standard StyleId Patterns
|
||||
|
||||
| Template Origin | StyleId Pattern | Example |
|
||||
|----------------|-----------------|---------|
|
||||
| Chinese Word (default) | Numeric/alphabetic | `1`, `2`, `3`, `a`, `a0` |
|
||||
| English Word (default) | English names | `Heading1`, `Normal`, `Title` |
|
||||
| Google Docs export | Prefixed | `Subtitle`, `NormalWeb` |
|
||||
| WPS Office | Mixed | `1`, `Heading1`, custom names |
|
||||
| Academic templates | Custom | `ThesisHeading1`, `ThesisBody` |
|
||||
|
||||
### Building the Mapping Table
|
||||
|
||||
Follow this algorithm:
|
||||
|
||||
1. **List source styleIds** actually used in `document.xml` (not all defined in `styles.xml`):
|
||||
```python
|
||||
# Pseudocode: find all unique pStyle values in source document.xml
|
||||
used_styles = set()
|
||||
for p in body.iter('w:p'):
|
||||
pStyle = p.find('w:pPr/w:pStyle')
|
||||
if pStyle is not None:
|
||||
used_styles.add(pStyle.get('val'))
|
||||
```
|
||||
|
||||
2. **For each used style**, find the best match in template:
|
||||
- First try: exact styleId match
|
||||
- Second try: match by `w:name` value (case-insensitive)
|
||||
- Third try: match by style purpose (any heading → template's heading style)
|
||||
- Fallback: map to template's default paragraph style (usually `Normal` or `a`)
|
||||
|
||||
3. **Validate the mapping** — every source styleId must map to an existing template styleId:
|
||||
```
|
||||
✓ Heading1 → 1 (name match: "heading 1")
|
||||
✓ Heading2 → 2 (name match: "heading 2")
|
||||
✓ Normal → a (name match: "Normal")
|
||||
✗ CustomCallout → ??? (no match found, will fall back to 'a')
|
||||
```
|
||||
|
||||
4. **Apply the mapping** when copying content — update every `<w:pStyle w:val="..."/>`:
|
||||
```xml
|
||||
<!-- Source -->
|
||||
<w:pPr><w:pStyle w:val="Heading1"/></w:pPr>
|
||||
<!-- After mapping -->
|
||||
<w:pPr><w:pStyle w:val="1"/></w:pPr>
|
||||
```
|
||||
|
||||
### Unmapped Styles
|
||||
Styles in the source document that have no match in the template are logged as warnings:
|
||||
```
|
||||
WARNING: Style 'CustomCallout' has no mapping in template. Content will fall back to 'a' (Normal).
|
||||
```
|
||||
|
||||
The content is preserved; only the style reference is updated to the template's default paragraph style.
|
||||
|
||||
### C-2 BASE-REPLACE: Additional StyleId Considerations
|
||||
|
||||
When using the template as a base document (C-2 strategy), the template's `styles.xml` is already in place. You must:
|
||||
|
||||
1. **Never copy source `styles.xml`** — the template's styles are the authority
|
||||
2. **Map every content paragraph's pStyle** to the template's styleId before insertion
|
||||
3. **Strip direct formatting selectively** (see detailed rules below) — let the template style control appearance
|
||||
4. **Verify table styles** — if source tables use `TableGrid` but template defines it as `a3` or similar, remap `<w:tblStyle>` too
|
||||
5. **Check character styles** — `rPr` inside runs may reference character styles like `Hyperlink` or `Strong` that have different IDs in the template
|
||||
|
||||
### Direct Formatting Cleanup Rules (Detailed)
|
||||
|
||||
When copying content from source to template, apply these rules to EACH paragraph and run:
|
||||
|
||||
**REMOVE from `<w:rPr>`:**
|
||||
- `<w:rFonts w:ascii="..." w:hAnsi="..."/>` — Latin font overrides (EXCEPT: keep `w:eastAsia`)
|
||||
- `<w:sz>`, `<w:szCs>` — font size (let style control)
|
||||
- `<w:color>` — text color
|
||||
- `<w:highlight>` — highlight color
|
||||
- `<w:shd>` — shading
|
||||
- `<w:b>`, `<w:i>` — bold/italic UNLESS the source style requires it (e.g., emphasis)
|
||||
- `<w:u>` — underline
|
||||
- `<w:spacing>` — character spacing
|
||||
|
||||
**KEEP in `<w:rPr>`:**
|
||||
- `<w:rFonts w:eastAsia="宋体"/>` — CJK font declaration (MUST keep, or Chinese text renders wrong)
|
||||
- `<w:rFonts w:eastAsia="华文中宋"/>` — same reason
|
||||
- Anything inside `<w:drawing>` — image references (handle separately via rId remapping)
|
||||
|
||||
**REMOVE from `<w:pPr>`:**
|
||||
- `<w:pBdr>` — paragraph borders
|
||||
- `<w:shd>` — paragraph shading
|
||||
- `<w:spacing>` — line/paragraph spacing (let style control)
|
||||
- `<w:jc>` — justification (let style control)
|
||||
- `<w:tabs>` — custom tab stops
|
||||
- `<w:rPr>` inside pPr — default run formatting for the paragraph
|
||||
|
||||
**KEEP in `<w:pPr>`:**
|
||||
- `<w:pStyle>` — style reference (after mapping to template's styleId)
|
||||
- `<w:sectPr>` — section properties (if intentionally inserting section breaks)
|
||||
- `<w:numPr>` — numbering reference (after mapping numId to template's numbering)
|
||||
|
||||
**Table cells (`<w:tc>`):**
|
||||
Apply the same rPr/pPr cleanup to every paragraph inside every cell. Also:
|
||||
- Keep `<w:tcPr>` structural properties (column span, row span, width)
|
||||
- Remove `<w:tcPr><w:shd>` (cell shading — let table style control)
|
||||
|
||||
---
|
||||
|
||||
## Relationship ID Remapping
|
||||
|
||||
When copying parts (headers, footers, images) from the template into the source package, relationship IDs (`r:id`) may collide.
|
||||
|
||||
**Problem**:
|
||||
- Source has `rId7` → `image1.png`
|
||||
- Template has `rId7` → `header1.xml`
|
||||
- Copying template's `rId7` overwrites source's image reference
|
||||
|
||||
**Solution**:
|
||||
1. Scan source's `document.xml.rels` for all existing `rId` values
|
||||
2. Find the maximum numeric ID (e.g., `rId12`)
|
||||
3. Remap all template relationship IDs starting from `rId13`
|
||||
4. Update all references in copied parts to use new IDs
|
||||
|
||||
```xml
|
||||
<!-- Template original -->
|
||||
<Relationship Id="rId1" Type="...header" Target="header1.xml" />
|
||||
|
||||
<!-- After remapping into source package -->
|
||||
<Relationship Id="rId13" Type="...header" Target="header1.xml" />
|
||||
|
||||
<!-- Update sectPr reference -->
|
||||
<w:headerReference w:type="default" r:id="rId13" />
|
||||
```
|
||||
|
||||
### Hyperlink Relationship Merging
|
||||
|
||||
When the source document contains external hyperlinks (e.g., URLs in references or footnotes), these are stored as relationships in `word/_rels/document.xml.rels`:
|
||||
|
||||
```xml
|
||||
<Relationship Id="rId15" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
|
||||
Target="https://example.com/paper" TargetMode="External"/>
|
||||
```
|
||||
|
||||
The corresponding text in document.xml references this rId:
|
||||
```xml
|
||||
<w:hyperlink r:id="rId15">
|
||||
<w:r><w:t>https://example.com/paper</w:t></w:r>
|
||||
</w:hyperlink>
|
||||
```
|
||||
|
||||
**Merging steps:**
|
||||
1. Scan source document.xml for all `<w:hyperlink r:id="...">` elements
|
||||
2. For each, find the corresponding relationship in source's rels file
|
||||
3. Check if template already has a relationship with the same Target URL
|
||||
- If yes: reuse the existing rId, update the hyperlink reference
|
||||
- If no: assign a new rId (starting from template's max rId + 1), add the relationship to template's rels, update the hyperlink reference
|
||||
4. Also check for hyperlink relationships used in footnotes (`word/_rels/footnotes.xml.rels`) and endnotes
|
||||
|
||||
**Common mistake:** Copying hyperlink paragraphs without merging rels → hyperlinks silently break (clicking does nothing in Word).
|
||||
|
||||
---
|
||||
|
||||
## XSD Gate-Check
|
||||
|
||||
### What It Is
|
||||
|
||||
After template application, the output document **MUST** pass `business-rules.xsd` validation. This is a **hard gate** — if it fails, the document is **NOT deliverable**.
|
||||
|
||||
### What business-rules.xsd Checks
|
||||
|
||||
| Rule | What It Validates |
|
||||
|------|-------------------|
|
||||
| Template styles exist | All styles referenced by content paragraphs are defined in `styles.xml` |
|
||||
| Margins match | Page margins match template specification |
|
||||
| Fonts correct | `w:docDefaults` fonts match template's font scheme |
|
||||
| Heading hierarchy | Heading levels are sequential (no H1 → H3 without H2) |
|
||||
| Required styles present | `Normal`, `Heading1`-`Heading3`, `TableGrid` exist |
|
||||
| Page size | Matches template's declared page size |
|
||||
|
||||
### Handling Failures
|
||||
|
||||
```
|
||||
GATE-CHECK FAILED:
|
||||
- Style 'CustomStyle1' referenced in paragraph 14 but not defined in styles.xml
|
||||
- Margin w:left=1080 does not match template requirement 1440
|
||||
```
|
||||
|
||||
Fix each failure:
|
||||
1. **Missing style**: Add the style definition to `styles.xml`, or remap the paragraph to an existing style
|
||||
2. **Margin mismatch**: Update `w:sectPr` margins to match template
|
||||
3. **Font mismatch**: Update `w:docDefaults` to match template font scheme
|
||||
4. **Heading hierarchy gap**: Insert intermediate heading levels or adjust existing levels
|
||||
|
||||
Re-validate after every fix until gate-check passes.
|
||||
|
||||
---
|
||||
|
||||
## Common Pitfalls
|
||||
|
||||
### 1. Orphaned Numbering References
|
||||
|
||||
**Problem**: Source document uses `w:numId="5"` in list paragraphs, but after replacing `numbering.xml` with the template's version, numbering ID 5 doesn't exist.
|
||||
|
||||
**Symptom**: Lists appear as plain paragraphs (no bullets/numbers).
|
||||
|
||||
**Fix**:
|
||||
- Map source numbering IDs to template numbering IDs
|
||||
- Update all `w:numId` references in document content
|
||||
- Or merge source numbering definitions into template's `numbering.xml`
|
||||
|
||||
### 2. Missing Theme Colors
|
||||
|
||||
**Problem**: Source document's styles reference theme colors (`w:themeColor="accent1"`) that have different values in the template's theme.
|
||||
|
||||
**Symptom**: Colors change unexpectedly (usually acceptable — this IS the point of re-theming). But if a style uses `w:color` with both `w:val` and `w:themeColor`, the theme color wins in Word.
|
||||
|
||||
**Fix**: Review color changes. If specific colors must be preserved, use explicit `w:val` without `w:themeColor`.
|
||||
|
||||
### 3. Section Property Conflicts
|
||||
|
||||
**Problem**: Source document has multiple sections (e.g., portrait + landscape pages), but the template assumes a single section.
|
||||
|
||||
**Symptom**: All sections get the same margins/orientation, breaking landscape pages.
|
||||
|
||||
**Fix**:
|
||||
- Only apply template section properties to the final `w:sectPr` in `w:body`
|
||||
- Preserve intermediate `w:sectPr` elements (inside `w:pPr`) from the source
|
||||
- Or apply template properties to all sections but preserve orientation overrides
|
||||
|
||||
### 4. Embedded Font Conflicts
|
||||
|
||||
**Problem**: Template specifies fonts not available on the target system.
|
||||
|
||||
**Fix**: Either embed fonts in the DOCX (`word/fonts/`) or use web-safe alternatives:
|
||||
- Calibri → available on Windows/Mac/Office online
|
||||
- Arial → universal fallback
|
||||
- Times New Roman → universal serif fallback
|
||||
|
||||
### 5. Broken Style Inheritance
|
||||
|
||||
**Problem**: Template has `Heading1` based on `Normal`, but after applying template, `Normal` has different properties, cascading unwanted changes to headings.
|
||||
|
||||
**Fix**: Verify the `w:basedOn` chain for all critical styles. Ensure base styles are also correctly transferred from template.
|
||||
|
||||
---
|
||||
|
||||
## Verification Checklist
|
||||
|
||||
After template application, verify:
|
||||
|
||||
1. **Content preserved** — text diff shows zero content changes
|
||||
2. **Gate-check passed** — `business-rules.xsd` validation succeeds
|
||||
3. **Styles applied** — headings, body text, tables use template formatting
|
||||
4. **Images intact** — all images render correctly (relationship IDs valid)
|
||||
5. **Lists working** — numbered and bulleted lists display correctly
|
||||
6. **Headers/footers** — template headers/footers appear on all pages
|
||||
7. **Page layout** — margins, page size, orientation match template
|
||||
8. **No corruption** — file opens without errors in Word
|
||||
@@ -0,0 +1,200 @@
|
||||
# Track Changes Guide
|
||||
|
||||
## Overview
|
||||
|
||||
Track Changes in OpenXML uses revision markup elements to record insertions, deletions, and formatting changes. Each revision has a unique ID, author, and timestamp.
|
||||
|
||||
---
|
||||
|
||||
## Insertion: `<w:ins>`
|
||||
|
||||
Wraps runs that were inserted during tracking:
|
||||
|
||||
```xml
|
||||
<w:ins w:id="1" w:author="John Smith" w:date="2026-03-21T10:30:00Z">
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" />
|
||||
<w:sz w:val="22" />
|
||||
</w:rPr>
|
||||
<w:t>This text was inserted.</w:t>
|
||||
</w:r>
|
||||
</w:ins>
|
||||
```
|
||||
|
||||
- `w:id` — unique revision ID (integer, must be unique across document)
|
||||
- `w:author` — free text string identifying the author
|
||||
- `w:date` — ISO 8601 format with timezone: `YYYY-MM-DDTHH:MM:SSZ`
|
||||
- Content inside is normal runs (`w:r`) with optional formatting
|
||||
|
||||
---
|
||||
|
||||
## Deletion: `<w:del>`
|
||||
|
||||
Wraps runs that were deleted during tracking:
|
||||
|
||||
```xml
|
||||
<w:del w:id="2" w:author="John Smith" w:date="2026-03-21T10:31:00Z">
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" />
|
||||
<w:sz w:val="22" />
|
||||
</w:rPr>
|
||||
<w:delText xml:space="preserve">This text was deleted.</w:delText>
|
||||
</w:r>
|
||||
</w:del>
|
||||
```
|
||||
|
||||
**CRITICAL**: Inside `<w:del>`, text MUST use `<w:delText>`, NOT `<w:t>`. Using `<w:t>` inside a deletion is invalid and will cause corruption or unexpected behavior. Word may silently repair it, but other consumers will fail.
|
||||
|
||||
---
|
||||
|
||||
## Formatting Change: `<w:rPrChange>`
|
||||
|
||||
Records that a run's formatting was changed. Placed inside `w:rPr`, it stores the **previous** formatting:
|
||||
|
||||
```xml
|
||||
<w:r>
|
||||
<w:rPr>
|
||||
<w:b /> <!-- Current: bold -->
|
||||
<w:rPrChange w:id="3" w:author="Jane Doe" w:date="2026-03-21T11:00:00Z">
|
||||
<w:rPr>
|
||||
<!-- Previous: not bold (empty rPr means no formatting) -->
|
||||
</w:rPr>
|
||||
</w:rPrChange>
|
||||
</w:rPr>
|
||||
<w:t>This text was made bold.</w:t>
|
||||
</w:r>
|
||||
```
|
||||
|
||||
The outer `w:rPr` holds the **new** (current) formatting. The `w:rPrChange` child holds the **old** (previous) formatting.
|
||||
|
||||
---
|
||||
|
||||
## Paragraph Property Change: `<w:pPrChange>`
|
||||
|
||||
Records paragraph-level formatting changes (alignment, spacing, style):
|
||||
|
||||
```xml
|
||||
<w:pPr>
|
||||
<w:jc w:val="center" /> <!-- Current: centered -->
|
||||
<w:pPrChange w:id="4" w:author="Jane Doe" w:date="2026-03-21T11:05:00Z">
|
||||
<w:pPr>
|
||||
<w:jc w:val="left" /> <!-- Previous: left-aligned -->
|
||||
</w:pPr>
|
||||
</w:pPrChange>
|
||||
</w:pPr>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Revision ID Management
|
||||
|
||||
- Every revision element (`w:ins`, `w:del`, `w:rPrChange`, `w:pPrChange`, `w:tblPrChange`, etc.) requires a `w:id` attribute
|
||||
- IDs must be **unique integers** across the entire document
|
||||
- IDs should be **monotonically increasing** (not strictly required, but expected by Word)
|
||||
- When adding revisions, scan for the current maximum `w:id` and increment from there
|
||||
|
||||
```
|
||||
Existing max ID: 47
|
||||
New insertion: w:id="48"
|
||||
New deletion: w:id="49"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Author and Date
|
||||
|
||||
- **Author**: Free text. Use consistent strings (e.g., `"MiniMaxAI"` for all automated edits)
|
||||
- **Date**: ISO 8601 with UTC timezone marker: `2026-03-21T10:30:00Z`
|
||||
- Must include the `T` separator and `Z` suffix (or `+HH:MM` offset)
|
||||
- Omitting the date is allowed but not recommended
|
||||
|
||||
---
|
||||
|
||||
## Operations
|
||||
|
||||
### Propose Insertion
|
||||
|
||||
Add `<w:ins>` wrapper around new content at the target location:
|
||||
|
||||
```xml
|
||||
<w:p>
|
||||
<w:r><w:t>Existing text. </w:t></w:r>
|
||||
<w:ins w:id="5" w:author="MiniMaxAI" w:date="2026-03-21T12:00:00Z">
|
||||
<w:r><w:t>Proposed new text. </w:t></w:r>
|
||||
</w:ins>
|
||||
<w:r><w:t>More existing text.</w:t></w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
### Propose Deletion
|
||||
|
||||
Wrap existing content in `<w:del>` and change `<w:t>` to `<w:delText>`:
|
||||
|
||||
```xml
|
||||
<w:p>
|
||||
<w:r><w:t>Keep this. </w:t></w:r>
|
||||
<w:del w:id="6" w:author="MiniMaxAI" w:date="2026-03-21T12:01:00Z">
|
||||
<w:r>
|
||||
<w:rPr><w:b /></w:rPr>
|
||||
<w:delText>Remove this.</w:delText>
|
||||
</w:r>
|
||||
</w:del>
|
||||
<w:r><w:t> Keep this too.</w:t></w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
### Accept a Tracked Change
|
||||
|
||||
- **Accept insertion**: Remove the `<w:ins>` wrapper, keep the inner runs as normal content
|
||||
- **Accept deletion**: Remove the entire `<w:del>` element and its content
|
||||
|
||||
### Reject a Tracked Change
|
||||
|
||||
- **Reject insertion**: Remove the entire `<w:ins>` element and its content
|
||||
- **Reject deletion**: Remove the `<w:del>` wrapper, change `<w:delText>` back to `<w:t>`
|
||||
|
||||
---
|
||||
|
||||
## Cross-Paragraph Operations
|
||||
|
||||
### Deleting a Paragraph Break (Merging Paragraphs)
|
||||
|
||||
When tracked deletion spans a paragraph boundary, use `<w:pPrChange>` on the merged paragraph:
|
||||
|
||||
```xml
|
||||
<w:p>
|
||||
<w:pPr>
|
||||
<w:pPrChange w:id="7" w:author="MiniMaxAI" w:date="2026-03-21T12:05:00Z">
|
||||
<w:pPr>
|
||||
<w:pStyle w:val="Normal" />
|
||||
</w:pPr>
|
||||
</w:pPrChange>
|
||||
</w:pPr>
|
||||
<w:r><w:t>First paragraph text. </w:t></w:r>
|
||||
<w:del w:id="8" w:author="MiniMaxAI" w:date="2026-03-21T12:05:00Z">
|
||||
<w:r><w:delText> </w:delText></w:r>
|
||||
</w:del>
|
||||
<w:r><w:t>Second paragraph text (now merged).</w:t></w:r>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
### Inserting a New Paragraph
|
||||
|
||||
The new paragraph's runs are wrapped in `<w:ins>`, and its paragraph mark is flagged as inserted:
|
||||
|
||||
```xml
|
||||
<w:p>
|
||||
<w:pPr>
|
||||
<w:rPr>
|
||||
<w:ins w:id="9" w:author="MiniMaxAI" w:date="2026-03-21T12:10:00Z" />
|
||||
</w:rPr>
|
||||
</w:pPr>
|
||||
<w:ins w:id="10" w:author="MiniMaxAI" w:date="2026-03-21T12:10:00Z">
|
||||
<w:r><w:t>Entirely new paragraph.</w:t></w:r>
|
||||
</w:ins>
|
||||
</w:p>
|
||||
```
|
||||
|
||||
The paragraph mark itself is marked as inserted via `w:ins` inside `w:pPr > w:rPr`.
|
||||
@@ -0,0 +1,506 @@
|
||||
# Troubleshooting Guide — Symptom-Driven
|
||||
|
||||
## How to Use This Guide
|
||||
|
||||
Search by the **SYMPTOM** you observe, not the technical concept. Each entry has four parts:
|
||||
- **Symptom** — what you see or what the user reports
|
||||
- **Diagnosis** — how to confirm the root cause
|
||||
- **Fix** — exact steps, commands, or code
|
||||
- **Prevention** — how to avoid it next time
|
||||
|
||||
**Quick search keywords:** headings wrong, body text, repair, corrupt, font, tables missing, images missing, TOC broken, update table, page break, section break, hyperlink, numbered list, bullets, margins, page size, Chinese tofu, cover page, track changes, revision marks
|
||||
|
||||
---
|
||||
|
||||
## 1. "All headings look like body text" (Heading Styles Not Applied)
|
||||
|
||||
**Symptom:** After template application, headings have no formatting — they look like Normal paragraphs. Font size, bold, spacing are all wrong.
|
||||
|
||||
**Diagnosis:** The `pStyle` values in `document.xml` don't match the `styleId` values in `styles.xml`.
|
||||
|
||||
Common mismatches:
|
||||
- Source uses `Heading1` but template defines the style as `1` (Chinese templates often use numeric styleIds)
|
||||
- Source uses `heading1` (lowercase) but template has `Heading1` (case-sensitive!)
|
||||
- `pStyle` references a style that simply doesn't exist in the output's `styles.xml`
|
||||
|
||||
Check with:
|
||||
```bash
|
||||
# List all pStyle values used in the document
|
||||
$CLI analyze --input output.docx | grep -i "pStyle"
|
||||
|
||||
# List all styleIds defined in styles.xml
|
||||
$CLI analyze --input template.docx --part styles | grep "styleId"
|
||||
```
|
||||
|
||||
**Fix:** Build a styleId mapping table before applying the template. Update every `pStyle` value in the document content.
|
||||
|
||||
```csharp
|
||||
// Build mapping: source styleId → template styleId
|
||||
var mapping = new Dictionary<string, string>();
|
||||
// Compare by style name (w:name), not by styleId
|
||||
foreach (var srcStyle in sourceStyles)
|
||||
{
|
||||
var templateStyle = templateStyles.FirstOrDefault(
|
||||
s => s.StyleName?.Val?.Value == srcStyle.StyleName?.Val?.Value);
|
||||
if (templateStyle != null)
|
||||
mapping[srcStyle.StyleId!] = templateStyle.StyleId!;
|
||||
}
|
||||
|
||||
// Apply mapping to all paragraphs
|
||||
foreach (var para in body.Descendants<Paragraph>())
|
||||
{
|
||||
var pStyle = para.ParagraphProperties?.ParagraphStyleId;
|
||||
if (pStyle != null && mapping.TryGetValue(pStyle.Val!, out var newId))
|
||||
pStyle.Val = newId;
|
||||
}
|
||||
```
|
||||
|
||||
**Prevention:** ALWAYS extract and compare styleIds from both source and template before template application. Never assume styleIds are the same across documents.
|
||||
|
||||
---
|
||||
|
||||
## 2. "Document opens with repair warnings" (XML Corruption)
|
||||
|
||||
**Symptom:** Word says "We found a problem with some content" or "Word found unreadable content" when opening.
|
||||
|
||||
**Diagnosis:** Element ordering is wrong. OpenXML is strict about child element order.
|
||||
|
||||
Common violations:
|
||||
- `pPr` must come before runs in `w:p`
|
||||
- `tblPr` must come before `tblGrid` in `w:tbl`
|
||||
- `rPr` must come before `t`/`br`/`tab` in `w:r`
|
||||
- `trPr` must come before `tc` in `w:tr`
|
||||
- `tcPr` must come before content in `w:tc`
|
||||
|
||||
```bash
|
||||
# Validate to find ordering issues
|
||||
$CLI validate --input doc.docx --xsd assets/xsd/wml-subset.xsd
|
||||
|
||||
# Auto-fix element ordering
|
||||
$CLI fix-order --input doc.docx
|
||||
|
||||
# Re-validate
|
||||
$CLI validate --input doc.docx --xsd assets/xsd/wml-subset.xsd
|
||||
```
|
||||
|
||||
**Fix:**
|
||||
```bash
|
||||
$CLI fix-order --input doc.docx
|
||||
```
|
||||
|
||||
If auto-fix doesn't resolve it, unpack and inspect manually:
|
||||
```bash
|
||||
$CLI unpack --input doc.docx --output unpacked/
|
||||
# Check word/document.xml for ordering issues
|
||||
# Fix, then repack:
|
||||
$CLI pack --input unpacked/ --output fixed.docx
|
||||
```
|
||||
|
||||
**Prevention:** Read `references/openxml_element_order.md` before writing any XML manipulation code. Always append properties elements first, then content elements.
|
||||
|
||||
---
|
||||
|
||||
## 3. "All text is in wrong font" (Font Contamination)
|
||||
|
||||
**Symptom:** Template specifies 宋体/Times New Roman but document shows Google Sans, Arial, Calibri, or whatever font the source document used.
|
||||
|
||||
**Diagnosis:** Source document's `rPr` contains inline `rFonts` declarations that override template styles. Direct formatting always wins over style-based formatting in OpenXML.
|
||||
|
||||
```bash
|
||||
# Check for font contamination
|
||||
$CLI analyze --input output.docx | grep -i "font"
|
||||
# Look for rFonts in the content — if present, they're overriding styles
|
||||
```
|
||||
|
||||
**Fix:** Strip `rFonts` from `rPr` when copying content, but KEEP `w:eastAsia` for CJK text:
|
||||
|
||||
```csharp
|
||||
foreach (var rPr in body.Descendants<RunProperties>())
|
||||
{
|
||||
var rFonts = rPr.GetFirstChild<RunFonts>();
|
||||
if (rFonts != null)
|
||||
{
|
||||
// Preserve EastAsia font for CJK — removing it causes tofu (□□□)
|
||||
var eastAsia = rFonts.EastAsia?.Value;
|
||||
rFonts.Remove();
|
||||
|
||||
// Re-add only eastAsia if it was set and text contains CJK
|
||||
if (!string.IsNullOrEmpty(eastAsia))
|
||||
{
|
||||
rPr.Append(new RunFonts { EastAsia = eastAsia });
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Also strip these common direct formatting overrides:
|
||||
- `w:sz` / `w:szCs` (font size)
|
||||
- `w:color` (text color)
|
||||
- `w:b` / `w:i` when they contradict the style
|
||||
|
||||
**Prevention:** Always clean direct formatting when copying content between documents. Keep only `pStyle`/`rStyle` references, `w:t` text, and `w:eastAsia` font declarations for CJK text.
|
||||
|
||||
---
|
||||
|
||||
## 4. "Tables are missing" (Tables Lost During Copy)
|
||||
|
||||
**Symptom:** Source had 5 tables but output only has 2 (or 0).
|
||||
|
||||
**Diagnosis:** Code selected only top-level paragraphs — `body.findall('w:p')` in Python or `body.Elements<Paragraph>()` in C# — instead of iterating all children. This skips `w:tbl` elements.
|
||||
|
||||
```bash
|
||||
# Verify table count
|
||||
$CLI analyze --input source.docx | grep -i "table"
|
||||
$CLI analyze --input output.docx | grep -i "table"
|
||||
```
|
||||
|
||||
**Fix:** Use `list(body)` or `body.ChildElements` to get ALL top-level children including tables:
|
||||
|
||||
```csharp
|
||||
// WRONG — skips tables, section properties, and other non-paragraph elements
|
||||
var paragraphs = body.Elements<Paragraph>();
|
||||
|
||||
// CORRECT — gets everything: paragraphs, tables, SDT blocks, etc.
|
||||
var allElements = body.ChildElements.ToList();
|
||||
```
|
||||
|
||||
In Python with lxml:
|
||||
```python
|
||||
# WRONG
|
||||
elements = body.findall('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p')
|
||||
|
||||
# CORRECT
|
||||
elements = list(body) # all direct children
|
||||
```
|
||||
|
||||
**Prevention:** Always use `list(body)` or `body.ChildElements` for iteration, never filter by a single element type alone when copying content.
|
||||
|
||||
---
|
||||
|
||||
## 5. "Images are missing or show broken icon"
|
||||
|
||||
**Symptom:** Image placeholders appear but images don't render. Or images are completely absent.
|
||||
|
||||
**Diagnosis:** The `r:embed` rId in `w:drawing` doesn't match any relationship in `document.xml.rels`, or the media file wasn't copied to the output ZIP.
|
||||
|
||||
```bash
|
||||
# Check relationships
|
||||
$CLI analyze --input output.docx --part rels | grep -i "image"
|
||||
|
||||
# Check if media files exist
|
||||
$CLI unpack --input output.docx --output unpacked/
|
||||
ls unpacked/word/media/
|
||||
```
|
||||
|
||||
**Fix:**
|
||||
1. Check source rels for image file paths
|
||||
2. Copy media files from source to output
|
||||
3. Add/update relationships in output rels
|
||||
4. Update `r:embed` values in drawing elements
|
||||
|
||||
```csharp
|
||||
// When copying content with images between documents:
|
||||
foreach (var drawing in body.Descendants<Drawing>())
|
||||
{
|
||||
var blip = drawing.Descendants<DocumentFormat.OpenXml.Drawing.Blip>().FirstOrDefault();
|
||||
if (blip?.Embed?.Value != null)
|
||||
{
|
||||
var sourceRel = sourcePart.GetReferenceRelationship(blip.Embed.Value);
|
||||
// Copy the image part to the target document
|
||||
var imagePart = targetPart.AddImagePart(ImagePartType.Png);
|
||||
using var stream = sourcePart.GetPartById(blip.Embed.Value).GetStream();
|
||||
imagePart.FeedData(stream);
|
||||
// Update the rId reference
|
||||
blip.Embed = targetPart.GetIdOfPart(imagePart);
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Prevention:** Always do rId remapping + media file copy when moving content between documents. Never assume rIds are portable across documents.
|
||||
|
||||
---
|
||||
|
||||
## 6. "TOC shows stale/wrong entries" or "Update Table doesn't work"
|
||||
|
||||
**Symptom:** Table of contents shows the template's example entries (e.g., "第1章 绪论...1") instead of actual headings. Or clicking "Update Table" in Word does nothing.
|
||||
|
||||
**Diagnosis:**
|
||||
- **Stale entries (normal):** TOC entries are static text cached inside the field. They don't auto-update until the user explicitly updates in Word.
|
||||
- **Update Table fails:** The SDT wrapper or field code structure is damaged. The TOC in real templates is a mixed structure: SDT block + field code + static entries.
|
||||
|
||||
```bash
|
||||
# Check if TOC SDT exists
|
||||
$CLI analyze --input output.docx | grep -i "sdt\|toc"
|
||||
```
|
||||
|
||||
**Fix:**
|
||||
- **If entries are just stale:** This is expected behavior. The user must right-click the TOC and choose "Update Field" in Word. Alternatively, enable auto-update on open:
|
||||
```csharp
|
||||
// See FieldAndTocSamples.EnableUpdateFieldsOnOpen()
|
||||
FieldAndTocSamples.EnableUpdateFieldsOnOpen(settingsPart);
|
||||
```
|
||||
- **If SDT is damaged:** Keep the entire SDT block from the template intact. Do not modify it.
|
||||
- **If field code is missing:** Ensure the TOC contains: `fldChar begin` + `instrText` + `fldChar separate` + static entries + `fldChar end`. See `FieldAndTocSamples.CreateMixedTocStructure()` for the complete pattern.
|
||||
- **If you rebuilt TOC from scratch (common mistake):** You likely destroyed the SDT wrapper. Use the template's original SDT block instead. See `Samples/FieldAndTocSamples.cs` method `CreateMixedTocStructure` for how real-world TOC is structured.
|
||||
|
||||
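**Prevention:** When doing Base-Replace (C-2), keep the template's TOC zone completely untouched. Do not strip, rebuild, or modify the SDT block. The cached TOC entries stay stale until the user updates the field in Word (or until the document is opened, if update-fields-on-open is enabled in settings).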
|
||||
|
||||
---
|
||||
|
||||
## 7. "Chapters don't start on new pages" (Missing Section Breaks)
|
||||
|
||||
**Symptom:** Content flows continuously without page breaks between chapters. Chapter 2 starts right after Chapter 1's last paragraph on the same page.
|
||||
|
||||
**Diagnosis:** No `sectPr` elements or page break paragraphs between chapters.
|
||||
|
||||
**Fix:** Insert a paragraph with `sectPr` in its `pPr` before each chapter heading, or insert a page break:
|
||||
|
||||
```csharp
|
||||
// Option 1: Section break (preserves per-section settings like headers/margins)
|
||||
var breakPara = new Paragraph(
|
||||
new ParagraphProperties(
|
||||
new SectionProperties(
|
||||
new SectionType { Val = SectionMarkValues.NextPage })));
|
||||
|
||||
// Option 2: Simple page break (lighter weight)
|
||||
var breakPara = new Paragraph(
|
||||
new Run(new Break { Type = BreakValues.Page }));
|
||||
|
||||
// Insert before each Heading1
|
||||
body.InsertBefore(breakPara, heading1Paragraph);
|
||||
```
|
||||
|
||||
**Prevention:** When copying content, insert page/section breaks before Heading1 paragraphs as needed. Check source document's section structure before copying.
|
||||
|
||||
---
|
||||
|
||||
## 8. "Hyperlinks don't work" (Broken Links)
|
||||
|
||||
**Symptom:** Clicking a hyperlink in the output document does nothing, or it navigates to the wrong URL.
|
||||
|
||||
**Diagnosis:** `w:hyperlink r:id` points to a relationship that doesn't exist in `document.xml.rels`.
|
||||
|
||||
```bash
|
||||
# Check hyperlink relationships
|
||||
$CLI analyze --input output.docx --part rels | grep -i "hyperlink"
|
||||
```
|
||||
|
||||
**Fix:** Merge source document's hyperlink relationships into output's rels file. Update rId references.
|
||||
|
||||
```csharp
|
||||
foreach (var hyperlink in body.Descendants<Hyperlink>())
|
||||
{
|
||||
if (hyperlink.Id?.Value != null)
|
||||
{
|
||||
var sourceRel = sourcePart.HyperlinkRelationships
|
||||
.FirstOrDefault(r => r.Id == hyperlink.Id.Value);
|
||||
if (sourceRel != null)
|
||||
{
|
||||
targetPart.AddHyperlinkRelationship(sourceRel.Uri, sourceRel.IsExternal);
|
||||
var newRel = targetPart.HyperlinkRelationships.Last();
|
||||
hyperlink.Id = newRel.Id;
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**Prevention:** Always merge ALL relationship types (images, hyperlinks, headers, footers) when combining documents. Never assume source rIds work in the target.
|
||||
|
||||
---
|
||||
|
||||
## 9. "Numbered lists show wrong numbers" or "Bullets disappeared"
|
||||
|
||||
**Symptom:** Lists that were numbered 1, 2, 3 now show 1, 1, 1 or have no numbers/bullets at all.
|
||||
|
||||
**Diagnosis:** `numId` in `pPr` references a numbering definition that doesn't exist in `numbering.xml`, or `abstractNumId` mapping is broken.
|
||||
|
||||
```bash
|
||||
# Check numbering definitions
|
||||
$CLI analyze --input output.docx --part numbering
|
||||
```
|
||||
|
||||
**Fix:** Map source numIds to template numIds, or merge numbering definitions:
|
||||
|
||||
```csharp
|
||||
// 1. Copy abstractNum definitions from source to target numbering.xml
|
||||
// 2. Create new num entries pointing to the copied abstractNum
|
||||
// 3. Update all numId references in document content
|
||||
|
||||
var sourceNumbering = sourceNumberingPart.Numbering;
|
||||
var targetNumbering = targetNumberingPart.Numbering;
|
||||
|
||||
// Get max existing IDs to avoid collisions
|
||||
int maxAbstractNumId = targetNumbering.Elements<AbstractNum>()
|
||||
.Max(a => a.AbstractNumberId?.Value ?? 0) + 1;
|
||||
int maxNumId = targetNumbering.Elements<NumberingInstance>()
|
||||
.Max(n => n.NumberID?.Value ?? 0) + 1;
|
||||
```
|
||||
|
||||
**Prevention:** Include `numbering.xml` reconciliation in template application workflow. See `Samples/ListAndNumberingSamples.cs` for correct numbering setup.
|
||||
|
||||
---
|
||||
|
||||
## 10. "Page margins/size are wrong"
|
||||
|
||||
**Symptom:** Output has different margins, page size, or orientation than the template.
|
||||
|
||||
**Diagnosis:** Source document's `sectPr` is overriding the template's `sectPr`. The final `sectPr` (child of `body`) controls the last section's layout.
|
||||
|
||||
```bash
|
||||
# Compare section properties
|
||||
$CLI analyze --input template.docx | grep -i "sectPr\|margin\|pgSz"
|
||||
$CLI analyze --input output.docx | grep -i "sectPr\|margin\|pgSz"
|
||||
```
|
||||
|
||||
**Fix:** Use the template's final `sectPr`. For intermediate `sectPr` elements (multi-section documents), merge carefully.
|
||||
|
||||
```csharp
|
||||
// Replace output's final sectPr with template's
|
||||
var templateSectPr = templateBody.Elements<SectionProperties>().LastOrDefault();
|
||||
var outputSectPr = outputBody.Elements<SectionProperties>().LastOrDefault();
|
||||
|
||||
if (templateSectPr != null)
|
||||
{
|
||||
var cloned = templateSectPr.CloneNode(true) as SectionProperties;
|
||||
if (outputSectPr != null)
|
||||
outputBody.ReplaceChild(cloned!, outputSectPr);
|
||||
else
|
||||
outputBody.Append(cloned!);
|
||||
}
|
||||
```
|
||||
|
||||
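**Prevention:** Always use the template's `sectPr` as authority for page layout. Strip the source document's final (body-level) `sectPr` before copying content, and decide explicitly how to handle any intermediate `sectPr` elements in multi-section documents.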
|
||||
|
||||
---
|
||||
|
||||
## 11. "Chinese text renders as boxes/tofu"
|
||||
|
||||
**Symptom:** Chinese characters display as square boxes (□□□) or missing glyphs.
|
||||
|
||||
**Diagnosis:** `rFonts w:eastAsia` is set to a font that doesn't exist on the system, or is missing entirely. Without an East Asian font declaration, the rendering engine may fall back to a font without CJK coverage.
|
||||
|
||||
**Fix:** Ensure all CJK text has `w:eastAsia` set to an available font:
|
||||
|
||||
```csharp
|
||||
foreach (var run in body.Descendants<Run>())
|
||||
{
|
||||
var text = run.InnerText;
|
||||
if (ContainsCjk(text))
|
||||
{
|
||||
var rPr = run.RunProperties ?? new RunProperties();
|
||||
var rFonts = rPr.GetFirstChild<RunFonts>();
|
||||
if (rFonts == null)
|
||||
{
|
||||
rFonts = new RunFonts();
|
||||
rPr.Append(rFonts);
|
||||
}
|
||||
// Set to a universally available CJK font
|
||||
rFonts.EastAsia = "SimSun"; // 宋体 — safest default
|
||||
if (run.RunProperties == null) run.PrependChild(rPr);
|
||||
}
|
||||
}
|
||||
|
||||
static bool ContainsCjk(string text)
|
||||
{
|
||||
return text.Any(c => c >= 0x4E00 && c <= 0x9FFF);
|
||||
}
|
||||
```
|
||||
|
||||
Common safe CJK fonts: 宋体 (SimSun), 黑体 (SimHei), 仿宋 (FangSong), 楷体 (KaiTi).
|
||||
|
||||
**Prevention:** When cleaning `rPr` formatting, ALWAYS preserve `w:eastAsia` font declarations. See also `references/cjk_typography.md`.
|
||||
|
||||
---
|
||||
|
||||
## 12. "Template's cover page / declaration page is missing"
|
||||
|
||||
**Symptom:** Output document starts directly with body content — no cover page, no declaration, no abstract, no table of contents. The template's structural front matter was discarded.
|
||||
|
||||
**Diagnosis:** Used Overlay (C-1) strategy when Base-Replace (C-2) was needed. Overlay applies styles to the source document but discards the template's structural content (cover, declaration, abstract, TOC).
|
||||
|
||||
```bash
|
||||
# Check template structure
|
||||
$CLI analyze --input template.docx
|
||||
# If template has >50 paragraphs with cover/TOC/declaration, C-2 is needed
|
||||
```
|
||||
|
||||
**Fix:** Use Base-Replace (C-2) strategy — template is the base, only replace the example body content zone with the user's content:
|
||||
|
||||
1. Identify the template's "body zone" (everything between TOC and final sectPr)
|
||||
2. Remove the template's example body content
|
||||
3. Insert the user's content into the body zone
|
||||
4. Keep everything else from the template (cover, declaration, abstract, TOC, sectPr)
|
||||
|
||||
```bash
|
||||
$CLI apply-template --input source.docx --template template.docx --output out.docx --strategy base-replace
|
||||
```
|
||||
|
||||
**Prevention:** Analyze template structure FIRST. If template has structural content (cover, TOC, declaration sections), always use C-2 (Base-Replace). Read `references/scenario_c_apply_template.md` for detailed decision criteria.
|
||||
|
||||
---
|
||||
|
||||
## 13. "Track changes markers appear unexpectedly"
|
||||
|
||||
**Symptom:** Output shows red/green revision marks (insertions, deletions) that weren't in the source document.
|
||||
|
||||
**Diagnosis:** Template had track changes enabled, or content was inserted as revisions rather than normal text.
|
||||
|
||||
```bash
|
||||
# Check for revision marks
|
||||
$CLI analyze --input output.docx | grep -i "revision\|ins\|del\|track"
|
||||
```
|
||||
|
||||
**Fix:** Accept all revisions by flattening `w:ins` and `w:del` elements:
|
||||
|
||||
```csharp
|
||||
// Accept insertions: unwrap w:ins, keep content
|
||||
foreach (var ins in body.Descendants<InsertedRun>().ToList())
|
||||
{
|
||||
var parent = ins.Parent!;
|
||||
foreach (var child in ins.ChildElements.ToList())
|
||||
{
|
||||
parent.InsertBefore(child.CloneNode(true), ins);
|
||||
}
|
||||
ins.Remove();
|
||||
}
|
||||
|
||||
// Accept deletions: remove w:del and its content entirely
|
||||
foreach (var del in body.Descendants<DeletedRun>().ToList())
|
||||
{
|
||||
del.Remove();
|
||||
}
|
||||
```
|
||||
|
||||
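To stop further edits from being recorded as revisions, also disable tracking in settings (this does not remove revision marks that already exist — accept them as shown above):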
|
||||
```csharp
|
||||
var settings = settingsPart.Settings;
|
||||
var trackChanges = settings.GetFirstChild<TrackChanges>();
|
||||
trackChanges?.Remove();
|
||||
```
|
||||
|
||||
**Prevention:** Check template's `settings.xml` for `trackChanges` before starting. If present, accept all revisions in the template first.
|
||||
|
||||
---
|
||||
|
||||
## Recovery Strategy — When Multiple Issues Exist
|
||||
|
||||
When a document has multiple problems, fix them in this priority order:
|
||||
|
||||
```
|
||||
1. [Content_Types].xml — without this, nothing opens
|
||||
2. _rels/.rels — package relationships
|
||||
3. word/_rels/document.xml.rels — part relationships (images, hyperlinks)
|
||||
4. word/document.xml — element ordering (fix-order)
|
||||
5. word/styles.xml — style definitions and styleId mapping
|
||||
6. word/numbering.xml — list/numbering definitions
|
||||
7. Everything else — headers, footers, comments, settings
|
||||
```
|
||||
|
||||
```bash
|
||||
# Full recovery pipeline
|
||||
$CLI unpack --input broken.docx --output unpacked/
|
||||
$CLI validate --input broken.docx --xsd assets/xsd/wml-subset.xsd # find all errors
|
||||
$CLI fix-order --input broken.docx # fix element ordering
|
||||
$CLI validate --input broken.docx --business # check business rules
|
||||
scripts/docx_preview.sh broken.docx # visual check
|
||||
```
|
||||
@@ -0,0 +1,294 @@
|
||||
# Professional Document Design & Typography Guide
|
||||
|
||||
## Table of Contents
|
||||
1. [Font Pairing](#font-pairing)
|
||||
2. [Font Sizes by Document Type](#font-sizes-by-document-type)
|
||||
3. [Line Spacing](#line-spacing)
|
||||
4. [Paragraph Spacing](#paragraph-spacing)
|
||||
5. [Page Layout](#page-layout)
|
||||
6. [Table Design](#table-design)
|
||||
7. [Color Schemes](#color-schemes)
|
||||
8. [Visual Hierarchy](#visual-hierarchy)
|
||||
9. [Quick Reference Defaults](#quick-reference-defaults)
|
||||
|
||||
---
|
||||
|
||||
## Font Pairing
|
||||
|
||||
### Recommended Pairs
|
||||
|
||||
| Headings | Body | Style | Best For |
|
||||
|----------|------|-------|----------|
|
||||
| Calibri Light | Calibri | Modern sans | Corporate reports |
|
||||
| Aptos | Aptos | Office 365 default | Modern business docs |
|
||||
| Cambria | Calibri | Serif + sans | Academic-corporate hybrid |
|
||||
| Times New Roman | Times New Roman | Traditional serif | Academic, legal |
|
||||
| Arial | Arial | Clean sans | Memos, internal docs |
|
||||
| Georgia | Garamond | Classical serif pair | Formal reports |
|
||||
|
||||
### Rules
|
||||
|
||||
- **Limit**: 2 font families max (3 if CJK mixed)
|
||||
- **Contrast**: Pair serif with sans-serif, OR use weight contrast within one family
|
||||
- **Consistency**: Same font for all body text, same font for all headings
|
||||
|
||||
---
|
||||
|
||||
## Font Sizes by Document Type
|
||||
|
||||
| Document Type | Body | H1 | H2 | H3 | Footnotes |
|
||||
|--------------|------|----|----|----|----|
|
||||
| **Business report** | 11pt | 18-20pt | 14-16pt | 12-13pt bold | 9pt |
|
||||
| **Business letter** | 11-12pt | — | — | — | 9-10pt |
|
||||
| **Memo** | 11pt | 14pt bold | 12pt bold | 11pt bold | 9pt |
|
||||
| **Contract / Legal** | 12pt | 14pt bold caps | 12pt bold | 12pt bold | 10pt |
|
||||
| **Academic (APA 7)** | 12pt | 12pt bold center | 12pt bold left | 12pt bold italic | 10pt |
|
||||
| **Resume / CV** | 10-11pt | 14-16pt | 12pt bold | 11pt bold | 8-9pt |
|
||||
| **Chinese 公文** | 三号(16pt) | 二号(22pt) | 三号(16pt) | 四号(14pt) | 小四(12pt) |
|
||||
|
||||
### OpenXML `w:sz` Values (half-points)
|
||||
|
||||
| Point Size | `w:sz` Val | Common Use |
|
||||
|-----------|-----------|------------|
|
||||
| 9pt | 18 | Footnotes, captions |
|
||||
| 10pt | 20 | Compact body text |
|
||||
| 10.5pt (五号) | 21 | CJK body small |
|
||||
| 11pt | 22 | Standard body (Calibri) |
|
||||
| 12pt (小四) | 24 | Standard body (TNR), CJK |
|
||||
| 14pt (四号) | 28 | CJK body, subheading |
|
||||
| 16pt (三号) | 32 | CJK heading, western H2 |
|
||||
| 18pt (小二) | 36 | Western H1 |
|
||||
| 22pt (二号) | 44 | CJK document title |
|
||||
| 26pt (一号) | 52 | Large title |
|
||||
|
||||
---
|
||||
|
||||
## Line Spacing
|
||||
|
||||
| Spacing | OpenXML `w:spacing line` | When to Use |
|
||||
|---------|--------------------------|-------------|
|
||||
| Single (1.0) | `line="240"` lineRule="auto" | Tables, footnotes, captions |
|
||||
| 1.08 (MS default) | `line="259"` lineRule="auto" | Modern Office documents |
|
||||
| 1.15 | `line="276"` lineRule="auto" | Business reports — best general default |
|
||||
| 1.5 | `line="360"` lineRule="auto" | Some academic, drafts for markup |
|
||||
| Double (2.0) | `line="480"` lineRule="auto" | APA/MLA manuscripts, legal briefs |
|
||||
| Fixed 28pt | `line="560"` lineRule="exact" | Chinese 公文 (GB/T 9704) |
|
||||
|
||||
**`lineRule` values**: `auto` = proportional (240 = 1 line), `exact` = fixed height, `atLeast` = minimum.
|
||||
|
||||
---
|
||||
|
||||
## Paragraph Spacing
|
||||
|
||||
| Element | Space Before (DXA) | Space After (DXA) |
|
||||
|---------|-------------------|-------------------|
|
||||
| Body paragraph | 0 | 120-160 (6-8pt) |
|
||||
| Heading 1 | 480 (24pt) | 120-240 |
|
||||
| Heading 2 | 360 (18pt) | 120 |
|
||||
| Heading 3 | 240 (12pt) | 80-120 |
|
||||
| List items | 0 | 40-80 (2-4pt) |
|
||||
| Block quote | 120-240 | 120-240 |
|
||||
| Table/Figure caption | 240 | 240 |
|
||||
|
||||
**Principle**: Space before a heading > space after, so heading visually "belongs to" content below (2:1 or 3:1 ratio).
|
||||
|
||||
---
|
||||
|
||||
## Page Layout
|
||||
|
||||
### Margins by Document Type
|
||||
|
||||
| Document Type | Top | Bottom | Left | Right | DXA Values |
|
||||
|--------------|-----|--------|------|-------|------------|
|
||||
| **Standard business** | 1 in | 1 in | 1 in | 1 in | 1440 all |
|
||||
| **Academic (APA/MLA)** | 1 in | 1 in | 1 in | 1 in | 1440 all |
|
||||
| **Thesis (binding)** | 1 in | 1 in | 1.5 in | 1 in | T/B:1440 L:2160 R:1440 |
|
||||
| **Chinese 公文** | 37mm | 35mm | 28mm | 26mm | T:2098 B:1984 L:1588 R:1474 |
|
||||
| **Narrow modern** | 0.75 in | 0.75 in | 0.75 in | 0.75 in | 1080 all |
|
||||
| **Wide** | 1 in | 1 in | 2 in | 2 in | T/B:1440 L/R:2880 |
|
||||
|
||||
### Page Sizes
|
||||
|
||||
| Size | Width × Height | DXA Width × Height |
|
||||
|------|---------------|-------------------|
|
||||
| US Letter | 8.5 × 11 in | 12240 × 15840 |
|
||||
| A4 | 210 × 297 mm | 11906 × 16838 |
|
||||
| Legal | 8.5 × 14 in | 12240 × 20160 |
|
||||
| A3 | 297 × 420 mm | 16838 × 23811 |
|
||||
|
||||
**Rule**: A4 for international audiences, Letter for US-only.
|
||||
|
||||
### Page Numbers
|
||||
|
||||
| Convention | Placement | Common In |
|
||||
|-----------|-----------|-----------|
|
||||
| Bottom center | Footer, centered | Academic, government |
|
||||
| Bottom right | Footer, right-aligned | Business reports |
|
||||
| "Page X of Y" | Footer, right-aligned | Contracts, legal |
|
||||
| Bottom outside | Alternating L/R for odd/even | Books, bound reports |
|
||||
| Chinese 公文 | Bottom center, format "-X-" | Government documents |
|
||||
|
||||
---
|
||||
|
||||
## Table Design
|
||||
|
||||
### Style Patterns
|
||||
|
||||
| Style | Description | When to Use |
|
||||
|-------|------------|-------------|
|
||||
| **Three-line (三线表)** | Top rule + header-bottom rule + bottom rule only, no vertical lines | Academic, scientific — gold standard |
|
||||
| **Banded rows** | Alternating white/light-gray, no borders | Modern corporate |
|
||||
| **Light grid** | Thin 0.5pt gray borders all cells | Business reports |
|
||||
| **Header-accent** | Dark/colored header row, no other borders | Modern templates |
|
||||
| **Full border** | All cells bordered | Financial tables, forms |
|
||||
|
||||
### Border Weights (OpenXML `w:sz` in eighths of a point)
|
||||
|
||||
| Visual | `Size` value | Points |
|
||||
|--------|-------------|--------|
|
||||
| Hairline | 2 | 0.25pt |
|
||||
| Thin | 4 | 0.5pt |
|
||||
| Medium | 8 | 1pt |
|
||||
| Thick | 12 | 1.5pt |
|
||||
|
||||
### Cell Padding
|
||||
|
||||
- **Minimum**: 0.02 in (28 DXA) — too tight for most uses
|
||||
- **Recommended**: 0.04-0.05 in (57-72 DXA) top/bottom, 0.05-0.075 in (72-108 DXA) left/right
|
||||
- **Spacious**: 0.06 in (86 DXA) top/bottom, 0.095 in (137 DXA) left/right
|
||||
|
||||
### Header Row Best Practices
|
||||
|
||||
- Bold text, optionally SMALL CAPS
|
||||
- Background: light gray (#F2F2F2) or dark with white text (#2F5496 + white)
|
||||
- Repeat header row on each page (`w:tblHeader` on `w:trPr`)
|
||||
- Right-align number columns, left-align text columns
|
||||
|
||||
---
|
||||
|
||||
## Color Schemes
|
||||
|
||||
### Corporate / Business
|
||||
|
||||
| Element | Hex | Notes |
|
||||
|---------|-----|-------|
|
||||
| Primary heading | #1F3864 | Dark navy, authoritative |
|
||||
| Secondary heading | #2E75B6 | Medium blue |
|
||||
| Body text | #333333 | Near-black (softer than #000) |
|
||||
| Table header bg | #4472C4 | With white #FFFFFF text |
|
||||
| Alternate row | #F2F2F2 | Subtle gray banding |
|
||||
| Hyperlink | #0563C1 | Standard blue |
|
||||
|
||||
### Academic
|
||||
|
||||
All text **#000000** (black). Color only in figures/charts.
|
||||
|
||||
### Chinese Government (公文)
|
||||
|
||||
| Element | Color |
|
||||
|---------|-------|
|
||||
| All body text | Black (required) |
|
||||
| 红头 agency name | Red #FF0000 |
|
||||
| 红线 separator | Red #FF0000 |
|
||||
| 公章 seal | Red |
|
||||
|
||||
### Accessibility
|
||||
|
||||
- Minimum contrast ratio 4.5:1 for normal text, 3:1 for large text (WCAG AA)
|
||||
- Never use color as sole means of conveying information
|
||||
- Ensure distinguishable in grayscale for printed documents
|
||||
|
||||
---
|
||||
|
||||
## Visual Hierarchy
|
||||
|
||||
### Heading Levels by Document Length
|
||||
|
||||
| Pages | Recommended Levels |
|
||||
|-------|-------------------|
|
||||
| 1-5 (memo, letter) | 1-2 levels |
|
||||
| 5-20 (report) | 2-3 levels |
|
||||
| 20-100 (long report) | 3-4 levels |
|
||||
| 100+ (thesis) | 4-5 levels max |
|
||||
|
||||
### Numbering Systems
|
||||
|
||||
**Decimal (ISO 2145)** — technical, international:
|
||||
```
|
||||
1 → 1.1 → 1.1.1 → 1.1.1.1
|
||||
```
|
||||
|
||||
**Traditional outline (US legal):**
|
||||
```
|
||||
I. → A. → 1. → a. → (1) → (a)
|
||||
```
|
||||
|
||||
**Chinese government (公文):**
|
||||
```
|
||||
一、(黑体) → (一)(楷体) → 1.(仿宋加粗) → (1)(仿宋)
|
||||
```
|
||||
|
||||
### Typography Emphasis
|
||||
|
||||
| Format | Use For | Avoid |
|
||||
|--------|---------|-------|
|
||||
| **Bold** | Key terms, headings, emphasis | Entire paragraphs |
|
||||
| *Italic* | Titles, foreign words, mild emphasis | Long passages (hard to read) |
|
||||
| Underline | Hyperlinks only (digital) | General emphasis (archaic) |
|
||||
| SMALL CAPS | Legal defined terms, acronyms | Body text |
|
||||
| ALL CAPS | Very short headings | Long text (reduces readability 15%) |
|
||||
|
||||
**CJK note**: Chinese/Japanese have no true italic. Use bold for emphasis.
|
||||
|
||||
### List Formatting
|
||||
|
||||
**Bullets** (unordered): `•` → `○` → `■` by level
|
||||
|
||||
**Numbers** (ordered): `1.` → `a.` → `i.` by level
|
||||
|
||||
- Indent each level 0.25-0.5 in (360-720 DXA)
|
||||
- Hanging indent: number hangs, text aligns consistently
|
||||
- Spacing between items: 2-4pt (less than paragraph spacing)
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference Defaults
|
||||
|
||||
### Business Report (Safe Default)
|
||||
|
||||
| Parameter | Value | OpenXML |
|
||||
|-----------|-------|---------|
|
||||
| Body font | Calibri 11pt | sz="22", RunFonts Ascii="Calibri" |
|
||||
| H1 | 18pt Bold Dark Blue | sz="36", Bold, Color="1F3864" (hex without `#`) |
|
||||
| H2 | 14pt Bold Dark Blue | sz="28", Bold |
|
||||
| H3 | 12pt Bold Dark Blue | sz="24", Bold |
|
||||
| Line spacing | 1.15 | line="276" lineRule="auto" |
|
||||
| Para after | 8pt | after="160" |
|
||||
| Margins | 1 in all | 1440 DXA all |
|
||||
| Page size | Letter or A4 | 12240×15840 or 11906×16838 |
|
||||
| Page numbers | Bottom right, 10pt | |
|
||||
|
||||
### Academic Paper (APA 7th)
|
||||
|
||||
| Parameter | Value | OpenXML |
|
||||
|-----------|-------|---------|
|
||||
| Font | Times New Roman 12pt | sz="24" |
|
||||
| Line spacing | Double | line="480" lineRule="auto" |
|
||||
| First-line indent | 0.5 in | ind firstLine="720" |
|
||||
| Margins | 1 in all | 1440 DXA all |
|
||||
| Page numbers | Top right | Header, right-aligned |
|
||||
|
||||
### Chinese Government (公文 GB/T 9704)
|
||||
|
||||
| Parameter | Value | OpenXML |
|
||||
|-----------|-------|---------|
|
||||
| Body font | 仿宋_GB2312 三号 | sz="32", EastAsia="FangSong_GB2312" |
|
||||
| Title | 小标宋 二号 centered | sz="44" |
|
||||
| L1 heading | 黑体 三号 | sz="32", EastAsia="SimHei" |
|
||||
| L2 heading | 楷体 三号 | sz="32", EastAsia="KaiTi_GB2312" |
|
||||
| Line spacing | Fixed 28pt | line="560" lineRule="exact" |
|
||||
| Margins | T:37mm B:35mm L:28mm R:26mm | T:2098 B:1984 L:1588 R:1474 |
|
||||
| Page size | A4 | 11906×16838 |
|
||||
| Page numbers | Bottom center, 宋体 四号, "-X-" | sz="28" |
|
||||
| Chars/line | 28 | |
|
||||
| Lines/page | 22 | |
|
||||
@@ -0,0 +1,158 @@
|
||||
# XSD Validation Guide
|
||||
|
||||
## Running Validation
|
||||
|
||||
```bash
|
||||
# Validate against the WML subset schema
|
||||
dotnet run --project minimax-docx validate input.docx --xsd assets/xsd/wml-subset.xsd
|
||||
|
||||
# Validate against business rules (REQUIRED for Scenario C gate-check)
|
||||
dotnet run --project minimax-docx validate input.docx --xsd assets/xsd/business-rules.xsd
|
||||
|
||||
# Validate against both
|
||||
dotnet run --project minimax-docx validate input.docx --xsd assets/xsd/wml-subset.xsd --xsd assets/xsd/business-rules.xsd
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## What wml-subset.xsd Covers
|
||||
|
||||
The subset schema validates the most common WordprocessingML elements:
|
||||
|
||||
| Area | Elements Validated |
|
||||
|------|--------------------|
|
||||
| Document structure | `w:document`, `w:body`, `w:sectPr` |
|
||||
| Paragraphs | `w:p`, `w:pPr`, `w:r`, `w:rPr`, `w:t` |
|
||||
| Tables | `w:tbl`, `w:tblPr`, `w:tblGrid`, `w:tr`, `w:tc` |
|
||||
| Styles | `w:styles`, `w:style`, `w:docDefaults` |
|
||||
| Lists | `w:numbering`, `w:abstractNum`, `w:num` |
|
||||
| Headers/Footers | `w:hdr`, `w:ftr` |
|
||||
| Track Changes | `w:ins`, `w:del`, `w:rPrChange`, `w:pPrChange` |
|
||||
| Comments | `w:comment`, `w:commentRangeStart`, `w:commentRangeEnd` |
|
||||
|
||||
### What It Does NOT Cover
|
||||
|
||||
- DrawingML elements (`a:`, `pic:`, `wp:`) — image/shape internals
|
||||
- VML elements (`v:`, `o:`) — legacy shapes
|
||||
- Math elements (`m:`) — equations
|
||||
- Extended namespaces (`w14`, `w15`, `w16*`) — vendor extensions
|
||||
- Custom XML data parts
|
||||
- Relationship and content type validation (structural, not schema-based)
|
||||
|
||||
---
|
||||
|
||||
## Interpreting Errors
|
||||
|
||||
### Element Ordering Error
|
||||
|
||||
```
|
||||
ERROR: Element 'w:jc' is not expected at this position.
|
||||
Expected: w:spacing, w:ind, w:contextualSpacing, ...
|
||||
Location: /word/document.xml, line 45
|
||||
```
|
||||
|
||||
**Cause**: Child elements are in wrong order. See `references/openxml_element_order.md`.
|
||||
**Fix**: Reorder children to match schema sequence.
|
||||
|
||||
### Missing Required Element
|
||||
|
||||
```
|
||||
ERROR: Element 'w:tbl' missing required child 'w:tblPr'.
|
||||
Location: /word/document.xml, line 102
|
||||
```
|
||||
|
||||
**Cause**: A required child element is absent.
|
||||
**Fix**: Add the missing element. Tables require both `w:tblPr` and `w:tblGrid`.
|
||||
|
||||
### Invalid Attribute Value
|
||||
|
||||
```
|
||||
ERROR: Attribute 'w:val' has invalid value 'middle'.
|
||||
Expected: 'left', 'center', 'right', 'both', 'distribute'
|
||||
Location: /word/document.xml, line 78
|
||||
```
|
||||
|
||||
**Cause**: An attribute value is not in the allowed enumeration.
|
||||
**Fix**: Use one of the valid values listed in the error.
|
||||
|
||||
### Unexpected Element
|
||||
|
||||
```
|
||||
ERROR: Element 'w:customTag' is not expected.
|
||||
Location: /word/document.xml, line 200
|
||||
```
|
||||
|
||||
**Cause**: An element not defined in the subset schema. May be a vendor extension.
|
||||
**Fix**: Check if it's a known extension (w14/w15/w16). If so, it's likely safe. If unknown, investigate or remove.
|
||||
|
||||
---
|
||||
|
||||
## Business Rules XSD
|
||||
|
||||
The `business-rules.xsd` schema enforces project-specific constraints beyond standard OpenXML validity:
|
||||
|
||||
| Rule | What It Checks |
|
||||
|------|---------------|
|
||||
| Required styles | `Normal`, `Heading1`-`Heading3`, `TableGrid` must exist in `styles.xml` |
|
||||
| Font consistency | `w:docDefaults` fonts match expected values |
|
||||
| Margin ranges | Page margins within acceptable range (720-2160 DXA) |
|
||||
| Page size | Must be A4 or Letter |
|
||||
| Heading hierarchy | No gaps (e.g., H1 → H3 without H2) |
|
||||
| Style chain | `w:basedOn` references must resolve to existing styles |
|
||||
|
||||
### Extending Business Rules
|
||||
|
||||
To add project-specific rules, add `xs:assert` or `xs:restriction` elements:
|
||||
|
||||
```xml
|
||||
<!-- Require minimum 1-inch margins -->
|
||||
<xs:element name="pgMar">
|
||||
<xs:complexType>
|
||||
<xs:attribute name="top">
|
||||
<xs:simpleType>
|
||||
<xs:restriction base="xs:integer">
|
||||
<xs:minInclusive value="1440" />
|
||||
</xs:restriction>
|
||||
</xs:simpleType>
|
||||
</xs:attribute>
|
||||
</xs:complexType>
|
||||
</xs:element>
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Gate-Check: Scenario C Hard Gate
|
||||
|
||||
In Scenario C (Apply Template), the output document **MUST** pass `business-rules.xsd` validation before delivery:
|
||||
|
||||
```
|
||||
1. Apply template → output.docx
|
||||
2. Validate → dotnet run ... validate output.docx --xsd business-rules.xsd
|
||||
3. PASS? → Deliver to user
|
||||
4. FAIL? → Fix issues, re-validate, repeat until PASS
|
||||
```
|
||||
|
||||
**This is a hard gate.** A document that fails business-rules validation is NOT deliverable, even if it opens correctly in Word.
|
||||
|
||||
---
|
||||
|
||||
## False Positives
|
||||
|
||||
### Vendor Extensions
|
||||
|
||||
Elements from extended namespaces (`w14`, `w15`, `w16*`) are not in the subset schema and may trigger warnings:
|
||||
|
||||
```
|
||||
WARNING: Element '{http://schemas.microsoft.com/office/word/2010/wordml}shadow' is not expected.
|
||||
```
|
||||
|
||||
These are generally safe to ignore — they are Microsoft extensions for newer features (e.g., advanced text effects, comment extensions).
|
||||
|
||||
### Markup Compatibility
|
||||
|
||||
Documents may contain `mc:AlternateContent` blocks with fallback content. The subset schema may not recognize the `mc:` namespace processing. These are safe if the document opens correctly in Word.
|
||||
|
||||
### Recommended Approach
|
||||
|
||||
1. Run validation
|
||||
2. Treat **errors** as must-fix
|
||||
3. Review **warnings** — ignore known vendor extensions, investigate unknown elements
|
||||
4. After fixing errors, re-validate to confirm
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/usr/bin/env bash
# Convert a legacy Word document (.doc) to .docx using headless LibreOffice.
#
# Usage: <script> <file.doc> [output_directory]
#   file.doc          input document (must exist)
#   output_directory  where the .docx is written (default: current directory)
#
# Progress and the final "Success:" line go to stdout; diagnostics go to stderr.
# Exit status: 0 on success, 1 on bad usage, missing input, missing soffice,
# or a failed conversion.
set -euo pipefail

usage() {
    echo "Usage: $(basename "$0") <file.doc> [output_directory]"
    echo "Convert .doc to .docx using LibreOffice."
    exit 1
}

if [ $# -lt 1 ]; then
    usage
fi

INPUT="$1"
OUTDIR="${2:-.}"

if [ ! -f "$INPUT" ]; then
    echo "Error: File not found: $INPUT" >&2
    exit 1
fi

if ! command -v soffice &>/dev/null; then
    echo "Error: soffice (LibreOffice) is required for .doc conversion but not found." >&2
    echo "Install LibreOffice: brew install --cask libreoffice" >&2
    exit 1
fi

# Derive the expected output name by dropping the input's last extension,
# whatever its spelling. Stripping only a literal lowercase ".doc" made
# inputs such as "REPORT.DOC" look for "REPORT.DOC.docx" and report a
# failure even though the conversion had succeeded.
# NOTE(review): relies on soffice replacing the last extension with ".docx".
FILENAME=$(basename -- "$INPUT")
BASENAME="${FILENAME%.*}"
mkdir -p "$OUTDIR"

echo "Converting: $INPUT -> $OUTDIR/$BASENAME.docx"

# Test the exit status explicitly: under `set -e` a bare failing command
# would terminate the script without printing any diagnostic.
if ! soffice --headless --convert-to docx --outdir "$OUTDIR" "$INPUT" >/dev/null 2>&1; then
    echo "Error: soffice exited with a failure status while converting: $INPUT" >&2
    exit 1
fi

# A zero exit status alone is not trusted; confirm the file really exists.
OUTPUT="$OUTDIR/$BASENAME.docx"
if [ ! -f "$OUTPUT" ]; then
    echo "Error: Conversion failed. Output file not created: $OUTPUT" >&2
    exit 1
fi

echo "Success: $OUTPUT"
|
||||
@@ -0,0 +1,37 @@
|
||||
#!/usr/bin/env bash
# Print a quick plain-text preview of a DOCX file.
#
# Usage: <script> <file.docx>
#
# With pandoc installed the document is rendered as plain text together with a
# word count and a rough page estimate (250 words per page). Without pandoc,
# or when pandoc cannot read the file, the first 100 lines of the raw
# word/document.xml part are shown instead.
# Exit status: 0 on success, 1 on bad usage, missing input, or an unreadable
# package.
set -euo pipefail

usage() {
    echo "Usage: $(basename "$0") <file.docx>"
    echo "Preview DOCX content as plain text."
    exit 1
}

if [ $# -lt 1 ]; then
    usage
fi

INPUT="$1"

if [ ! -f "$INPUT" ]; then
    echo "Error: File not found: $INPUT" >&2
    exit 1
fi

FILE_SIZE=$(du -h "$INPUT" | cut -f1)
echo "=== DOCX Preview: $(basename "$INPUT") ==="
echo "File size: $FILE_SIZE"

CONTENT=""
HAVE_TEXT=0
if command -v pandoc &>/dev/null; then
    # Run pandoc inside a condition so a conversion error does not make
    # `set -e` kill the script silently; fall through to the XML extract.
    if CONTENT=$(pandoc -f docx -t plain "$INPUT" 2>/dev/null); then
        HAVE_TEXT=1
    else
        echo "(pandoc could not read the file, falling back to raw XML extract)"
    fi
else
    echo "(pandoc not available, falling back to raw XML extract)"
fi

if [ "$HAVE_TEXT" -eq 1 ]; then
    WORD_COUNT=$(echo "$CONTENT" | wc -w | tr -d ' ')
    # Round up so any non-empty document reports at least one page.
    EST_PAGES=$(( (WORD_COUNT + 249) / 250 ))
    echo "Word count: $WORD_COUNT"
    echo "Estimated pages: $EST_PAGES"
    echo "---"
    echo "$CONTENT"
else
    echo "---"
    # Capture first, then truncate from a here-string: piping unzip straight
    # into head can end with SIGPIPE (exit 141 under pipefail), and an
    # extraction failure would otherwise be swallowed without a message.
    if ! XML=$(unzip -p "$INPUT" word/document.xml 2>/dev/null); then
        echo "Error: could not extract word/document.xml (is unzip installed and is this a valid DOCX?)" >&2
        exit 1
    fi
    head -n 100 <<<"$XML"
fi
|
||||
+19
@@ -0,0 +1,19 @@
|
||||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\MiniMaxAIDocx.Core\MiniMaxAIDocx.Core.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="System.CommandLine" Version="2.0.5" />
|
||||
</ItemGroup>
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFramework>net8.0</TargetFramework>
|
||||
<ImplicitUsings>enable</ImplicitUsings>
|
||||
<Nullable>enable</Nullable>
|
||||
<NeutralLanguage>en</NeutralLanguage>
|
||||
</PropertyGroup>
|
||||
|
||||
</Project>
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
using System.CommandLine;
|
||||
using MiniMaxAIDocx.Core.Commands;
|
||||
|
||||
// CLI entry point: build the root command with every subcommand registered,
// then parse the process arguments and run the matching action.
var cli = new RootCommand("minimax-docx: OpenXML document generation and manipulation CLI")
{
    // Scenario commands
    CreateCommand.Create(),
    EditContentCommand.Create(),
    ApplyTemplateCommand.Create(),

    // Tool commands
    ValidateCommand.Create(),
    MergeRunsCommand.Create(),
    FixOrderCommand.Create(),
    AnalyzeCommand.Create(),
    DiffCommand.Create(),
};

// The value returned by the invoked action becomes the process exit code.
var exitCode = cli.Parse(args).Invoke();
return exitCode;
|
||||
+147
@@ -0,0 +1,147 @@
|
||||
using System.CommandLine;
|
||||
using System.IO.Compression;
|
||||
using System.Text.Json;
|
||||
using System.Xml.Linq;
|
||||
|
||||
namespace MiniMaxAIDocx.Core.Commands;
|
||||
|
||||
/// <summary>
/// Implements the <c>analyze</c> tool command, which reads a DOCX package and
/// reports its structure: sections, headings, tables, images, header/footer
/// references, paragraph and word counts, custom styles and XML part sizes.
/// </summary>
public static class AnalyzeCommand
{
    private static readonly XNamespace W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Elements that separate words although they carry no text themselves.
    private static readonly XName[] WordBreakElements = { W + "p", W + "br", W + "tab", W + "cr" };

    /// <summary>
    /// Builds the <c>analyze</c> command with its <c>--input</c> (required)
    /// and <c>--json</c> options.
    /// </summary>
    /// <returns>The configured command, ready to be added to the root command.</returns>
    public static Command Create()
    {
        var inputOption = new Option<string>("--input") { Description = "DOCX file to analyze", Required = true };
        var jsonOption = new Option<bool>("--json") { Description = "Output as JSON" };

        var cmd = new Command("analyze", "Analyze document structure and styles")
        {
            inputOption, jsonOption
        };

        // Return an int so that failures surface as a non-zero exit code.
        cmd.SetAction(parseResult =>
        {
            return Run(parseResult.GetValue(inputOption)!, parseResult.GetValue(jsonOption));
        });

        return cmd;
    }

    /// <summary>
    /// Analyzes <paramref name="input"/> and writes the report to stdout.
    /// </summary>
    /// <param name="input">Path of the DOCX file to inspect.</param>
    /// <param name="asJson">Emit indented JSON instead of the text report.</param>
    /// <returns>0 on success; 1 when the file is missing or is not a readable DOCX.</returns>
    private static int Run(string input, bool asJson)
    {
        if (!File.Exists(input))
        {
            Console.Error.WriteLine($"File not found: {input}");
            return 1;
        }

        try
        {
            using var zip = ZipFile.OpenRead(input);
            var docEntry = zip.GetEntry("word/document.xml");
            if (docEntry == null)
            {
                Console.Error.WriteLine("Not a valid DOCX");
                return 1;
            }

            XDocument doc;
            using (var docStream = docEntry.Open())
                doc = XDocument.Load(docStream);

            var body = doc.Root?.Element(W + "body");
            if (body == null)
            {
                Console.Error.WriteLine("Not a valid DOCX: word/document.xml has no w:body");
                return 1;
            }

            // Sections; a sectPr without w:type is reported as "nextPage".
            var sections = body.Descendants(W + "sectPr").ToList();
            var sectionBreaks = sections
                .Select(s => (string?)s.Element(W + "type")?.Attribute(W + "val") ?? "nextPage")
                .ToList();

            var paragraphs = body.Descendants(W + "p").ToList();

            // Headings: paragraphs whose style id starts with "Heading".
            var headings = (
                from p in paragraphs
                let style = (string?)p.Element(W + "pPr")?.Element(W + "pStyle")?.Attribute(W + "val")
                where style?.StartsWith("Heading", StringComparison.OrdinalIgnoreCase) == true
                select new { style, text = string.Concat(p.Descendants(W + "t").Select(t => t.Value)) }
            ).ToList();

            // Tables: row count, and the cell count of the first row as column count.
            var tables = body.Descendants(W + "tbl").Select(tbl =>
            {
                var rowElements = tbl.Elements(W + "tr").ToList();
                return new
                {
                    rows = rowElements.Count,
                    cols = rowElements.FirstOrDefault()?.Elements(W + "tc").Count() ?? 0
                };
            }).ToList();

            // Images
            var images = body.Descendants(W + "drawing").Count();

            // Headers/footers referenced from any section.
            var headerRefs = sections.SelectMany(s => s.Elements(W + "headerReference")).Count();
            var footerRefs = sections.SelectMany(s => s.Elements(W + "footerReference")).Count();

            // Word count: insert a space at every paragraph, break and tab so the
            // last word of one paragraph is not glued to the first word of the next.
            var allText = string.Concat(body.Descendants().Select(e =>
                e.Name == W + "t" ? e.Value
                : WordBreakElements.Contains(e.Name) ? " "
                : string.Empty));
            var wordCount = allText
                .Split(new[] { ' ', '\t', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries)
                .Length;

            // XML part sizes under word/, largest first (ordinal match on entry names).
            var fileSizes = zip.Entries
                .Where(e => e.FullName.StartsWith("word/", StringComparison.Ordinal)
                         && e.FullName.EndsWith(".xml", StringComparison.Ordinal))
                .Select(e => new { file = e.FullName, size = e.Length })
                .OrderByDescending(e => e.size)
                .ToList();

            // Custom styles declared in word/styles.xml (the part is optional).
            var styleNames = new List<string>();
            var stylesEntry = zip.GetEntry("word/styles.xml");
            if (stylesEntry != null)
            {
                using var stylesStream = stylesEntry.Open();
                var stylesDoc = XDocument.Load(stylesStream);
                styleNames = stylesDoc.Descendants(W + "style")
                    .Where(s => IsOn((string?)s.Attribute(W + "customStyle")))
                    .Select(s => (string?)s.Attribute(W + "styleId") ?? "")
                    .Where(s => s != "")
                    .ToList();
            }

            var analysis = new
            {
                sections = new { count = sections.Count, breakTypes = sectionBreaks },
                headings,
                tables = new { count = tables.Count, details = tables },
                images,
                headerFooter = new { headers = headerRefs, footers = footerRefs },
                paragraphs = paragraphs.Count,
                estimatedWordCount = wordCount,
                xmlFileSizes = fileSizes,
                customStyles = new { count = styleNames.Count, names = styleNames }
            };

            if (asJson)
            {
                Console.WriteLine(JsonSerializer.Serialize(analysis, new JsonSerializerOptions { WriteIndented = true }));
            }
            else
            {
                Console.WriteLine($"Sections: {sections.Count} ({string.Join(", ", sectionBreaks)})");
                Console.WriteLine($"Headings: {headings.Count}");
                foreach (var h in headings)
                    Console.WriteLine($"  {h}");
                Console.WriteLine($"Tables: {tables.Count}");
                foreach (var t in tables)
                    Console.WriteLine($"  {t.rows} rows x {t.cols} cols");
                Console.WriteLine($"Images: {images}");
                Console.WriteLine($"Headers: {headerRefs}");
                Console.WriteLine($"Footers: {footerRefs}");
                Console.WriteLine($"Paragraphs: {paragraphs.Count}");
                Console.WriteLine($"Word count: ~{wordCount}");
                Console.WriteLine($"Custom styles: {styleNames.Count}");
                foreach (var s in styleNames)
                    Console.WriteLine($"  {s}");
                Console.WriteLine("XML file sizes:");
                foreach (var f in fileSizes)
                    Console.WriteLine($"  {f.file}: {f.size:N0} bytes");
            }

            return 0;
        }
        catch (Exception ex) when (ex is InvalidDataException or System.Xml.XmlException)
        {
            // The file is not a ZIP archive, or one of its XML parts is malformed.
            Console.Error.WriteLine($"Not a valid DOCX: {ex.Message}");
            return 1;
        }
    }

    /// <summary>
    /// Interprets an OpenXML on/off attribute value; "1", "true" and "on" mean enabled.
    /// </summary>
    private static bool IsOn(string? value) =>
        value is "1" or "true" or "on";
}
|
||||
+322
@@ -0,0 +1,322 @@
|
||||
using System.CommandLine;
|
||||
using DocumentFormat.OpenXml;
|
||||
using DocumentFormat.OpenXml.Packaging;
|
||||
using DocumentFormat.OpenXml.Wordprocessing;
|
||||
|
||||
namespace MiniMaxAIDocx.Core.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Scenario C: Apply formatting from a template DOCX to a source DOCX.
|
||||
/// Copies styles, theme, numbering, headers/footers, and section properties
|
||||
/// from the template while preserving all content from the source.
|
||||
/// </summary>
|
||||
public static class ApplyTemplateCommand
|
||||
{
|
||||
/// <summary>
/// Builds the <c>apply-template</c> command: copies the source DOCX to the output path and then
/// overlays the selected formatting components (styles, theme, numbering, headers/footers,
/// section properties) taken from the template DOCX.
/// </summary>
/// <returns>The configured <see cref="Command"/>, ready to be added to a root command.</returns>
public static Command Create()
{
    var inputOpt = new Option<string>("--input") { Description = "Source DOCX (content to keep)", Required = true };
    var templateOpt = new Option<string>("--template") { Description = "Template DOCX (formatting to apply)", Required = true };
    var outputOpt = new Option<string>("--output") { Description = "Output DOCX file path", Required = true };
    var applyStylesOpt = new Option<bool>("--apply-styles") { Description = "Copy styles.xml from template" };
    applyStylesOpt.DefaultValueFactory = _ => true;
    var applyThemeOpt = new Option<bool>("--apply-theme") { Description = "Copy theme from template" };
    applyThemeOpt.DefaultValueFactory = _ => true;
    var applyNumberingOpt = new Option<bool>("--apply-numbering") { Description = "Copy numbering.xml from template" };
    applyNumberingOpt.DefaultValueFactory = _ => true;
    // NOTE(review): unlike its siblings this option has no DefaultValueFactory, so headers/footers
    // are only copied when the flag is passed explicitly - confirm this opt-in is intentional.
    var applyHeadersFootersOpt = new Option<bool>("--apply-headers-footers") { Description = "Copy headers/footers from template" };
    var applySectionsOpt = new Option<bool>("--apply-sections") { Description = "Apply section properties from template" };
    applySectionsOpt.DefaultValueFactory = _ => true;

    var cmd = new Command("apply-template", "Apply template formatting to a DOCX")
    {
        inputOpt, templateOpt, outputOpt, applyStylesOpt, applyThemeOpt,
        applyNumberingOpt, applyHeadersFootersOpt, applySectionsOpt
    };

    // Two paths are treated as the same file only when their normalized full paths match exactly.
    static bool IsSamePath(string a, string b) =>
        string.Equals(Path.GetFullPath(a), Path.GetFullPath(b), StringComparison.Ordinal);

    cmd.SetAction((parseResult) =>
    {
        var inputPath = parseResult.GetValue(inputOpt)!;
        var templatePath = parseResult.GetValue(templateOpt)!;
        var outputPath = parseResult.GetValue(outputOpt)!;
        var applyStyles = parseResult.GetValue(applyStylesOpt);
        var applyTheme = parseResult.GetValue(applyThemeOpt);
        var applyNumbering = parseResult.GetValue(applyNumberingOpt);
        var applyHeadersFooters = parseResult.GetValue(applyHeadersFootersOpt);
        var applySections = parseResult.GetValue(applySectionsOpt);

        if (!File.Exists(inputPath)) { Console.Error.WriteLine($"Input file not found: {inputPath}"); return; }
        if (!File.Exists(templatePath)) { Console.Error.WriteLine($"Template file not found: {templatePath}"); return; }

        // Writing the output over the template would replace the template with the source
        // content before it is ever read, so refuse that combination outright.
        if (IsSamePath(outputPath, templatePath))
        {
            Console.Error.WriteLine($"Output path must differ from the template path: {outputPath}");
            return;
        }

        // Create output as a copy of the source. When the output IS the source there is nothing
        // to copy (a file cannot be copied onto itself); the document is then edited in place.
        if (!IsSamePath(inputPath, outputPath))
            File.Copy(inputPath, outputPath, overwrite: true);

        using var output = WordprocessingDocument.Open(outputPath, true);
        using var template = WordprocessingDocument.Open(templatePath, false);

        var outputMain = output.MainDocumentPart;
        var templateMain = template.MainDocumentPart;
        if (outputMain == null || templateMain == null)
        {
            Console.Error.WriteLine("Invalid document: missing main document part.");
            return;
        }

        // Counts the components that were requested and processed (reported in the summary line).
        int appliedCount = 0;

        if (applyStyles)
        {
            CopyStyles(templateMain, outputMain);
            appliedCount++;
            Console.WriteLine("  Applied: styles");
        }

        if (applyTheme)
        {
            CopyTheme(templateMain, outputMain);
            appliedCount++;
            Console.WriteLine("  Applied: theme");
        }

        if (applyNumbering)
        {
            CopyNumbering(templateMain, outputMain);
            appliedCount++;
            Console.WriteLine("  Applied: numbering");
        }

        if (applyHeadersFooters)
        {
            CopyHeadersAndFooters(templateMain, outputMain);
            appliedCount++;
            Console.WriteLine("  Applied: headers/footers");
        }

        if (applySections)
        {
            CopySectionProperties(templateMain, outputMain);
            appliedCount++;
            Console.WriteLine("  Applied: section properties");
        }

        outputMain.Document.Save();
        Console.WriteLine($"Applied {appliedCount} formatting component(s) from template to {outputPath}");
    });

    return cmd;
}
|
||||
|
||||
/// <summary>
/// Swaps the output document's style definitions for a byte copy of the template's.
/// Does nothing when the template carries no style definitions part.
/// </summary>
/// <param name="template">Main part of the template document (read from).</param>
/// <param name="output">Main part of the output document (modified).</param>
private static void CopyStyles(MainDocumentPart template, MainDocumentPart output)
{
    if (template.StyleDefinitionsPart is not { } sourcePart)
        return;

    // Drop whatever styles the output already has so the new part becomes the only one.
    if (output.StyleDefinitionsPart is { } stalePart)
        output.DeletePart(stalePart);

    var replacement = output.AddNewPart<StyleDefinitionsPart>();
    using (var data = sourcePart.GetStream(FileMode.Open, FileAccess.Read))
    {
        replacement.FeedData(data);
    }
}
|
||||
|
||||
/// <summary>
/// Swaps the output document's theme for a byte copy of the template's.
/// Does nothing when the template carries no theme part.
/// </summary>
/// <param name="template">Main part of the template document (read from).</param>
/// <param name="output">Main part of the output document (modified).</param>
private static void CopyTheme(MainDocumentPart template, MainDocumentPart output)
{
    if (template.ThemePart is not { } sourcePart)
        return;

    // Drop the output's current theme first so the new part becomes the only one.
    if (output.ThemePart is { } stalePart)
        output.DeletePart(stalePart);

    var replacement = output.AddNewPart<ThemePart>();
    using (var data = sourcePart.GetStream(FileMode.Open, FileAccess.Read))
    {
        replacement.FeedData(data);
    }
}
|
||||
|
||||
/// <summary>
/// Replaces the output's numbering definitions with a byte copy of the template's.
/// Numbering ids in the document content are NOT rewritten: a paragraph keeps its numId and
/// simply resolves against whichever template definition carries the same id. References
/// whose id has no counterpart in the template are reported on stderr.
/// </summary>
/// <param name="template">Main part of the template document (read from).</param>
/// <param name="output">Main part of the output document (modified).</param>
private static void CopyNumbering(MainDocumentPart template, MainDocumentPart output)
{
    var templateNumbering = template.NumberingDefinitionsPart;
    if (templateNumbering == null) return;

    // Distinct numbering ids used by the output's content. numId 0 is skipped: in
    // WordprocessingML it designates "numbering removed" rather than a definition.
    var referencedNumIds = new HashSet<int>();
    var body = output.Document?.Body;
    if (body != null)
    {
        foreach (var numId in body.Descendants<NumberingId>())
        {
            if (numId.Val?.Value is int contentId && contentId != 0)
                referencedNumIds.Add(contentId);
        }
    }

    // Ids the template actually defines (w:num elements).
    var templateNumIds = new HashSet<int>();
    var templateRoot = templateNumbering.Numbering;
    if (templateRoot != null)
    {
        foreach (var instance in templateRoot.Elements<NumberingInstance>())
        {
            if (instance.NumberID?.Value is int definedId)
                templateNumIds.Add(definedId);
        }
    }

    if (output.NumberingDefinitionsPart != null)
        output.DeletePart(output.NumberingDefinitionsPart);

    var newNumberingPart = output.AddNewPart<NumberingDefinitionsPart>();

    using (var stream = templateNumbering.GetStream(FileMode.Open, FileAccess.Read))
    {
        newNumberingPart.FeedData(stream);
    }

    var unresolved = referencedNumIds
        .Where(id => !templateNumIds.Contains(id))
        .OrderBy(id => id)
        .ToList();
    var resolvedCount = referencedNumIds.Count - unresolved.Count;

    if (resolvedCount > 0)
    {
        Console.WriteLine($"  Note: {resolvedCount} numbering reference(s) in document content mapped to template definitions.");
    }

    if (unresolved.Count > 0)
    {
        // These paragraphs now point at ids the replaced numbering part does not define.
        Console.Error.WriteLine($"[WARN] {unresolved.Count} numbering reference(s) in document content have no matching definition in the template (numId: {string.Join(", ", unresolved)}).");
    }
}
|
||||
|
||||
/// <summary>
/// Replaces the output's headers and footers with those referenced by the template's last
/// section. Each referenced template part is copied into the output (new relationship ids are
/// issued by the output package) and referenced from every section of the output document.
/// </summary>
/// <remarks>
/// The output is left untouched when the template has no body or no section properties, so a
/// template with nothing to offer never strips the document's existing headers/footers.
/// </remarks>
/// <param name="template">Main part of the template document (read from).</param>
/// <param name="output">Main part of the output document (modified).</param>
private static void CopyHeadersAndFooters(MainDocumentPart template, MainDocumentPart output)
{
    var outputBody = output.Document?.Body;
    if (outputBody == null) return;

    // Validate the template BEFORE any destructive change to the output.
    var templateFinalSectPr = template.Document?.Body?.Descendants<SectionProperties>().LastOrDefault();
    if (templateFinalSectPr == null) return;

    // Looks a part up by relationship id without throwing on a missing/unknown id.
    static TPart? FindPart<TPart>(MainDocumentPart owner, string? relationshipId) where TPart : OpenXmlPart
    {
        if (string.IsNullOrEmpty(relationshipId)) return null;
        return owner.Parts
            .Where(p => p.RelationshipId == relationshipId)
            .Select(p => p.OpenXmlPart)
            .OfType<TPart>()
            .FirstOrDefault();
    }

    // Remove existing header/footer parts from output
    foreach (var hp in output.HeaderParts.ToList())
        output.DeletePart(hp);
    foreach (var fp in output.FooterParts.ToList())
        output.DeletePart(fp);

    // Remove existing header/footer references from all section properties
    var outputSections = outputBody.Descendants<SectionProperties>().ToList();
    foreach (var sectPr in outputSections)
    {
        foreach (var hr in sectPr.Elements<HeaderReference>().ToList())
            hr.Remove();
        foreach (var fr in sectPr.Elements<FooterReference>().ToList())
            fr.Remove();
    }

    if (outputSections.Count == 0)
    {
        var created = new SectionProperties();
        outputBody.Append(created);
        outputSections.Add(created);
    }

    // Copy header parts once; remember (type, new relationship id) for the references.
    var headers = new List<(EnumValue<HeaderFooterValues>? Type, string Id)>();
    foreach (var headerRef in templateFinalSectPr.Elements<HeaderReference>())
    {
        var templateHeaderPart = FindPart<HeaderPart>(template, headerRef.Id?.Value);
        if (templateHeaderPart == null) continue;

        var newHeaderPart = output.AddNewPart<HeaderPart>();
        using (var stream = templateHeaderPart.GetStream(FileMode.Open, FileAccess.Read))
        {
            newHeaderPart.FeedData(stream);
        }

        CopyPartRelationships(templateHeaderPart, newHeaderPart);
        headers.Add((headerRef.Type, output.GetIdOfPart(newHeaderPart)));
    }

    // Copy footer parts the same way.
    var footers = new List<(EnumValue<HeaderFooterValues>? Type, string Id)>();
    foreach (var footerRef in templateFinalSectPr.Elements<FooterReference>())
    {
        var templateFooterPart = FindPart<FooterPart>(template, footerRef.Id?.Value);
        if (templateFooterPart == null) continue;

        var newFooterPart = output.AddNewPart<FooterPart>();
        using (var stream = templateFooterPart.GetStream(FileMode.Open, FileAccess.Read))
        {
            newFooterPart.FeedData(stream);
        }

        CopyPartRelationships(templateFooterPart, newFooterPart);
        footers.Add((footerRef.Type, output.GetIdOfPart(newFooterPart)));
    }

    // Reference the copied parts from EVERY section. All existing references were removed
    // above; attaching the new ones to the last section only would leave the earlier sections
    // of a multi-section document without any header/footer.
    // NOTE(review): w:titlePg (sectPr) and w:evenAndOddHeaders (settings) are not copied, so
    // "first"/"even" headers only show if the output already enables them - confirm desired.
    foreach (var sectPr in outputSections)
    {
        // Header references first, then footer references, at the very start of sectPr
        // (the position CT_SectPr requires), preserving the template's order.
        var insertAt = 0;
        foreach (var (type, id) in headers)
        {
            sectPr.InsertAt(new HeaderReference
            {
                Type = (EnumValue<HeaderFooterValues>?)type?.Clone(),
                Id = id
            }, insertAt++);
        }
        foreach (var (type, id) in footers)
        {
            sectPr.InsertAt(new FooterReference
            {
                Type = (EnumValue<HeaderFooterValues>?)type?.Clone(),
                Id = id
            }, insertAt++);
        }
    }
}
|
||||
|
||||
/// <summary>
/// Copies the relationships a copied header/footer needs from <paramref name="source"/> to
/// <paramref name="target"/>, keeping the original relationship ids so the r:id values inside
/// the copied XML stay valid: external relationships, hyperlink relationships, and embedded
/// image parts. Other embedded part types are not copied.
/// </summary>
/// <param name="source">Part whose relationships are read.</param>
/// <param name="target">Freshly created part that receives the copies.</param>
private static void CopyPartRelationships(OpenXmlPart source, OpenXmlPart target)
{
    foreach (var rel in source.ExternalRelationships)
    {
        target.AddExternalRelationship(rel.RelationshipType, rel.Uri, rel.Id);
    }

    // Hyperlinks are tracked separately from ExternalRelationships by the SDK; without this
    // a copied header/footer containing a link would reference a relationship id that does
    // not exist in the target part.
    foreach (var link in source.HyperlinkRelationships)
    {
        target.AddHyperlinkRelationship(link.Uri, link.IsExternal, link.Id);
    }

    foreach (var childPart in source.Parts)
    {
        try
        {
            var contentType = childPart.OpenXmlPart.ContentType;
            // Ordinal: content types are protocol tokens, not culture-sensitive text.
            if (!contentType.StartsWith("image/", StringComparison.Ordinal))
                continue;

            var newChild = target.AddNewPart<ImagePart>(contentType, childPart.RelationshipId);
            using var stream = childPart.OpenXmlPart.GetStream(FileMode.Open, FileAccess.Read);
            newChild.FeedData(stream);
        }
        catch (Exception ex)
        {
            // Best effort: one image that cannot be copied must not abort the whole operation.
            Console.Error.WriteLine($"[WARN] Failed to copy embedded image part '{childPart.RelationshipId}': {ex.Message}");
        }
    }
}
|
||||
|
||||
/// <summary>
/// Copies page size, page margins, page borders, columns, and document grid from the
/// template's last section properties onto the output's last section properties, creating
/// a section properties element in the output body when it has none.
/// </summary>
/// <param name="template">Main part of the template document (read from).</param>
/// <param name="output">Main part of the output document (modified).</param>
private static void CopySectionProperties(MainDocumentPart template, MainDocumentPart output)
{
    var templateBody = template.Document?.Body;
    var outputBody = output.Document?.Body;
    if (templateBody == null || outputBody == null) return;

    var templateSectPr = templateBody.Descendants<SectionProperties>().LastOrDefault();
    if (templateSectPr == null) return;

    var outputSectPr = outputBody.Descendants<SectionProperties>().LastOrDefault();
    if (outputSectPr == null)
    {
        outputSectPr = new SectionProperties();
        outputBody.Append(outputSectPr);
    }

    // Copied in the order CT_SectPr declares its children
    // (pgSz, pgMar, pgBorders, cols, docGrid) so that elements which end up appended to the
    // target keep a schema-conformant relative order.
    CopyChildElement<PageSize>(templateSectPr, outputSectPr);
    CopyChildElement<PageMargin>(templateSectPr, outputSectPr);
    CopyChildElement<PageBorders>(templateSectPr, outputSectPr);
    CopyChildElement<Columns>(templateSectPr, outputSectPr);
    CopyChildElement<DocGrid>(templateSectPr, outputSectPr);
}
|
||||
|
||||
/// <summary>
/// Copies the first child of type <typeparamref name="T"/> from <paramref name="source"/>
/// into <paramref name="target"/>. An existing child of that type in the target is replaced
/// at its current position; otherwise the copy is appended. Does nothing when the source
/// has no such child.
/// </summary>
/// <typeparam name="T">Section-property child element type to copy.</typeparam>
/// <param name="source">Section properties to read from (not modified).</param>
/// <param name="target">Section properties to update.</param>
private static void CopyChildElement<T>(SectionProperties source, SectionProperties target) where T : OpenXmlElement
{
    var sourceElement = source.GetFirstChild<T>();
    if (sourceElement == null) return;

    // Deep clone: an element can only have one parent, and the template must stay intact.
    var clone = (T)sourceElement.CloneNode(true);

    var existing = target.GetFirstChild<T>();
    if (existing != null)
    {
        // Replace in place. Removing and re-appending would move the element to the end of
        // sectPr and break the child order the target document already had.
        existing.InsertAfterSelf(clone);
        existing.Remove();
    }
    else
    {
        target.Append(clone);
    }
}
|
||||
}
|
||||
+324
@@ -0,0 +1,324 @@
|
||||
using System.CommandLine;
|
||||
using DocumentFormat.OpenXml;
|
||||
using DocumentFormat.OpenXml.Packaging;
|
||||
using DocumentFormat.OpenXml.Wordprocessing;
|
||||
using MiniMaxAIDocx.Core.OpenXml;
|
||||
using MiniMaxAIDocx.Core.Typography;
|
||||
|
||||
namespace MiniMaxAIDocx.Core.Commands;
|
||||
|
||||
/// <summary>
|
||||
/// Scenario A: Create a new DOCX document from scratch with proper styles, sections,
|
||||
/// headers/footers, and typography defaults.
|
||||
/// </summary>
|
||||
public static class CreateCommand
|
||||
{
|
||||
/// <summary>
/// Builds the <c>create</c> command: writes a new DOCX with default styles, optional
/// header/footer (with optional PAGE field), optional title/author/TOC paragraphs, optional
/// content loaded from a JSON file, and a final section properties element carrying the page
/// size and margins.
/// </summary>
/// <returns>The configured <see cref="Command"/>, ready to be added to a root command.</returns>
public static Command Create()
{
    var outputOption = new Option<string>("--output") { Description = "Output DOCX file path", Required = true };
    var typeOption = new Option<string>("--type") { Description = "Document type: report, letter, memo, academic" };
    typeOption.DefaultValueFactory = _ => "report";
    var titleOption = new Option<string>("--title") { Description = "Document title" };
    var authorOption = new Option<string>("--author") { Description = "Document author" };
    var pageSizeOption = new Option<string>("--page-size") { Description = "Page size: letter, a4, legal, a3" };
    pageSizeOption.DefaultValueFactory = _ => "letter";
    var marginsOption = new Option<string>("--margins") { Description = "Margin preset: standard, narrow, wide" };
    marginsOption.DefaultValueFactory = _ => "standard";
    var headerTextOption = new Option<string>("--header") { Description = "Header text" };
    var footerTextOption = new Option<string>("--footer") { Description = "Footer text" };
    var pageNumbersOption = new Option<bool>("--page-numbers") { Description = "Add page numbers in footer" };
    var tocOption = new Option<bool>("--toc") { Description = "Insert table of contents placeholder" };
    var contentJsonOption = new Option<string>("--content-json") { Description = "Path to JSON file describing document content" };

    var cmd = new Command("create", "Create a new DOCX document from scratch")
    {
        outputOption, typeOption, titleOption, authorOption, pageSizeOption,
        marginsOption, headerTextOption, footerTextOption, pageNumbersOption,
        tocOption, contentJsonOption
    };

    cmd.SetAction((parseResult) =>
    {
        var output = parseResult.GetValue(outputOption)!;
        var docType = parseResult.GetValue(typeOption) ?? "report";
        var title = parseResult.GetValue(titleOption);
        var author = parseResult.GetValue(authorOption);
        var pageSizeName = parseResult.GetValue(pageSizeOption) ?? "letter";
        var marginsName = parseResult.GetValue(marginsOption) ?? "standard";
        var headerText = parseResult.GetValue(headerTextOption);
        var footerText = parseResult.GetValue(footerTextOption);
        var pageNumbers = parseResult.GetValue(pageNumbersOption);
        var tocPlaceholder = parseResult.GetValue(tocOption);
        var contentJson = parseResult.GetValue(contentJsonOption);

        var fontConfig = GetFontConfig(docType);
        var pageSize = GetPageSizeConfig(pageSizeName);
        var margins = GetMargins(marginsName);

        using var doc = WordprocessingDocument.Create(output, WordprocessingDocumentType.Document);
        var mainPart = doc.AddMainDocumentPart();
        mainPart.Document = new Document(new Body());
        var body = mainPart.Document.Body!;

        // Add styles part with defaults
        AddDefaultStyles(mainPart, fontConfig);

        // Section properties. CT_SectPr is an ordered sequence: header/footer references come
        // first, then pgSz and pgMar - so the references are appended before the page setup.
        var sectPr = new SectionProperties();

        // Add header if requested
        if (!string.IsNullOrEmpty(headerText))
        {
            var headerPart = mainPart.AddNewPart<HeaderPart>();
            headerPart.Header = new Header(
                new Paragraph(new Run(new Text(headerText))));
            var headerRefId = mainPart.GetIdOfPart(headerPart);
            sectPr.Append(new HeaderReference
            {
                Type = HeaderFooterValues.Default,
                Id = headerRefId
            });
        }

        // Add footer if requested (footer text, a PAGE field, or both separated by a dash)
        if (!string.IsNullOrEmpty(footerText) || pageNumbers)
        {
            var footerPart = mainPart.AddNewPart<FooterPart>();
            var footerParagraph = new Paragraph();

            if (!string.IsNullOrEmpty(footerText))
            {
                footerParagraph.Append(new Run(new Text(footerText)));
            }

            if (pageNumbers)
            {
                if (!string.IsNullOrEmpty(footerText))
                    footerParagraph.Append(new Run(new Text(" — ") { Space = SpaceProcessingModeValues.Preserve }));

                // Complex field: begin / instruction / end runs.
                footerParagraph.Append(new Run(
                    new FieldChar { FieldCharType = FieldCharValues.Begin }));
                footerParagraph.Append(new Run(
                    new FieldCode(" PAGE ") { Space = SpaceProcessingModeValues.Preserve }));
                footerParagraph.Append(new Run(
                    new FieldChar { FieldCharType = FieldCharValues.End }));
            }

            footerPart.Footer = new Footer(footerParagraph);
            var footerRefId = mainPart.GetIdOfPart(footerPart);
            sectPr.Append(new FooterReference
            {
                Type = HeaderFooterValues.Default,
                Id = footerRefId
            });
        }

        // Page size and margins (after the header/footer references, per schema order)
        sectPr.Append(new DocumentFormat.OpenXml.Wordprocessing.PageSize
        {
            Width = (UInt32Value)(uint)pageSize.WidthDxa,
            Height = (UInt32Value)(uint)pageSize.HeightDxa
        });
        sectPr.Append(new PageMargin
        {
            Top = margins.TopDxa,
            Bottom = margins.BottomDxa,
            Left = (UInt32Value)(uint)margins.LeftDxa,
            Right = (UInt32Value)(uint)margins.RightDxa
        });

        // Title
        if (!string.IsNullOrEmpty(title))
        {
            var titlePara = new Paragraph(
                new ParagraphProperties(new ParagraphStyleId { Val = "Title" }),
                new Run(new Text(title)));
            body.Append(titlePara);
        }

        // Author subtitle
        if (!string.IsNullOrEmpty(author))
        {
            var authorPara = new Paragraph(
                new ParagraphProperties(new ParagraphStyleId { Val = "Subtitle" }),
                new Run(new Text(author)));
            body.Append(authorPara);
        }

        // TOC placeholder
        if (tocPlaceholder)
        {
            body.Append(new Paragraph(
                new ParagraphProperties(new ParagraphStyleId { Val = "TOCHeading" }),
                new Run(new Text("Table of Contents"))));

            // Insert TOC field; the text between "separate" and "end" is the placeholder
            // result shown until the field is updated.
            var tocPara = new Paragraph();
            tocPara.Append(new Run(new FieldChar { FieldCharType = FieldCharValues.Begin }));
            tocPara.Append(new Run(new FieldCode(" TOC \\o \"1-3\" \\h \\z \\u ") { Space = SpaceProcessingModeValues.Preserve }));
            tocPara.Append(new Run(new FieldChar { FieldCharType = FieldCharValues.Separate }));
            tocPara.Append(new Run(new Text("Update this field to generate table of contents.")));
            tocPara.Append(new Run(new FieldChar { FieldCharType = FieldCharValues.End }));
            body.Append(tocPara);

            // Page break after TOC
            body.Append(new Paragraph(new Run(new Break { Type = BreakValues.Page })));
        }

        // Content from JSON (if provided)
        if (!string.IsNullOrEmpty(contentJson))
        {
            if (File.Exists(contentJson))
            {
                var jsonContent = File.ReadAllText(contentJson);
                AddContentFromJson(body, jsonContent, fontConfig);
            }
            else
            {
                // Do not fail the whole command, but make the skipped content visible.
                Console.Error.WriteLine($"Warning: content JSON file not found: {contentJson}");
            }
        }

        // Ensure body has at least one paragraph
        if (!body.Elements<Paragraph>().Any())
        {
            body.Append(new Paragraph());
        }

        // sectPr must be the last child of body
        body.Append(sectPr);

        mainPart.Document.Save();
        Console.WriteLine($"Created {docType} document: {output}");
    });

    return cmd;
}
|
||||
|
||||
/// <summary>
/// Maps a document type name (case-insensitive) to its font preset.
/// Any name other than letter, memo, or academic yields the report preset.
/// </summary>
private static FontConfig GetFontConfig(string docType)
{
    switch (docType.ToLowerInvariant())
    {
        case "letter":
            return FontDefaults.Letter;
        case "memo":
            return FontDefaults.Memo;
        case "academic":
            return FontDefaults.Academic;
        default:
            return FontDefaults.Report;
    }
}
|
||||
|
||||
/// <summary>
/// Maps a page size name (case-insensitive) to its page size preset.
/// Any name other than a4, legal, or a3 yields the letter preset.
/// </summary>
private static Typography.PageSize GetPageSizeConfig(string name)
{
    switch (name.ToLowerInvariant())
    {
        case "a4":
            return PageSizes.A4;
        case "legal":
            return PageSizes.Legal;
        case "a3":
            return PageSizes.A3;
        default:
            return PageSizes.Letter;
    }
}
|
||||
|
||||
/// <summary>
/// Maps a margin preset name (case-insensitive) to its margin configuration.
/// Any name other than narrow or wide yields the standard margins.
/// </summary>
private static MarginConfig GetMargins(string name)
{
    switch (name.ToLowerInvariant())
    {
        case "narrow":
            return PageSizes.NarrowMargins;
        case "wide":
            return PageSizes.WideMargins;
        default:
            return PageSizes.StandardMargins;
    }
}
|
||||
|
||||
/// <summary>
/// Adds a style definitions part to <paramref name="mainPart"/> containing the Normal style,
/// Heading1-Heading6, Title, and Subtitle, using the fonts and sizes from
/// <paramref name="fontConfig"/>.
/// </summary>
/// <remarks>
/// Child elements are appended in the order the WordprocessingML schema lists them
/// (pPr: spacing before jc; rPr: rFonts, b, color, sz, szCs), because the SDK serializes
/// children exactly in the order they were added.
/// </remarks>
/// <param name="mainPart">Main document part that receives the new styles part.</param>
/// <param name="fontConfig">Font names and point sizes for body text and headings.</param>
private static void AddDefaultStyles(MainDocumentPart mainPart, FontConfig fontConfig)
{
    var stylesPart = mainPart.AddNewPart<StyleDefinitionsPart>();
    var styles = new Styles();

    // Run properties of the Normal style (body font and size)
    var defaultRPr = new StyleRunProperties(
        new RunFonts { Ascii = fontConfig.BodyFont, HighAnsi = fontConfig.BodyFont },
        new FontSize { Val = UnitConverter.FontSizeToSz(fontConfig.BodySize) },
        new FontSizeComplexScript { Val = UnitConverter.FontSizeToSz(fontConfig.BodySize) });

    // Normal style (the default paragraph style)
    styles.Append(new Style(
        new StyleName { Val = "Normal" },
        new PrimaryStyle(),
        defaultRPr)
    { Type = StyleValues.Paragraph, StyleId = "Normal", Default = true });

    // Heading styles 1-6
    double[] headingSizes = [fontConfig.Heading1Size, fontConfig.Heading2Size, fontConfig.Heading3Size,
        fontConfig.Heading4Size, fontConfig.Heading5Size, fontConfig.Heading6Size];
    for (int i = 0; i < 6; i++)
    {
        var level = i + 1;
        var headingStyle = new Style(
            new StyleName { Val = $"heading {level}" },
            new BasedOn { Val = "Normal" },
            new NextParagraphStyle { Val = "Normal" },
            new PrimaryStyle(),
            new StyleParagraphProperties(
                new KeepNext(),
                new KeepLines(),
                new SpacingBetweenLines { Before = "240", After = "120" },
                // Outline level is zero-based: Heading1 -> 0.
                new OutlineLevel { Val = i }),
            new StyleRunProperties(
                new RunFonts { Ascii = fontConfig.HeadingFont, HighAnsi = fontConfig.HeadingFont },
                // Bold precedes the size elements in the run-properties order.
                new Bold(),
                new FontSize { Val = UnitConverter.FontSizeToSz(headingSizes[i]) },
                new FontSizeComplexScript { Val = UnitConverter.FontSizeToSz(headingSizes[i]) }))
        { Type = StyleValues.Paragraph, StyleId = $"Heading{level}" };
        styles.Append(headingStyle);
    }

    // Title style (centered, heading font, 6pt larger than Heading 1)
    styles.Append(new Style(
        new StyleName { Val = "Title" },
        new BasedOn { Val = "Normal" },
        new NextParagraphStyle { Val = "Normal" },
        new PrimaryStyle(),
        new StyleParagraphProperties(
            // spacing precedes jc in the paragraph-properties sequence.
            new SpacingBetweenLines { After = "300" },
            new Justification { Val = JustificationValues.Center }),
        new StyleRunProperties(
            new RunFonts { Ascii = fontConfig.HeadingFont, HighAnsi = fontConfig.HeadingFont },
            new FontSize { Val = UnitConverter.FontSizeToSz(fontConfig.Heading1Size + 6) },
            new FontSizeComplexScript { Val = UnitConverter.FontSizeToSz(fontConfig.Heading1Size + 6) }))
    { Type = StyleValues.Paragraph, StyleId = "Title" });

    // Subtitle style (centered, grey, 2pt larger than body text)
    styles.Append(new Style(
        new StyleName { Val = "Subtitle" },
        new BasedOn { Val = "Normal" },
        new NextParagraphStyle { Val = "Normal" },
        new StyleParagraphProperties(
            new SpacingBetweenLines { After = "200" },
            new Justification { Val = JustificationValues.Center }),
        new StyleRunProperties(
            new Color { Val = "5A5A5A" },
            new FontSize { Val = UnitConverter.FontSizeToSz(fontConfig.BodySize + 2) }))
    { Type = StyleValues.Paragraph, StyleId = "Subtitle" });

    stylesPart.Styles = styles;
    stylesPart.Styles.Save();
}
|
||||
|
||||
/// <summary>
/// Appends content described by a JSON array to <paramref name="body"/>.
/// Each item is an object <c>{type, text?, level?}</c>, e.g.
/// <c>[{"type":"heading","text":"Introduction","level":1},{"type":"paragraph","text":"..."},{"type":"pagebreak"}]</c>.
/// Supported types: <c>heading</c> (level clamped to 1-6, default 1), <c>paragraph</c>,
/// <c>pagebreak</c>. A missing type is treated as <c>paragraph</c>; a missing text as empty.
/// </summary>
/// <remarks>
/// Malformed input never throws: invalid JSON, a non-array root, non-object items, and unknown
/// types are reported on stderr and skipped.
/// </remarks>
/// <param name="body">Document body that receives the generated paragraphs.</param>
/// <param name="jsonContent">Raw JSON text.</param>
/// <param name="fontConfig">Currently unused; kept for signature compatibility.</param>
private static void AddContentFromJson(Body body, string jsonContent, FontConfig fontConfig)
{
    // Returns the named property only when it exists and is a JSON string.
    static string? ReadString(System.Text.Json.JsonElement item, string name) =>
        item.TryGetProperty(name, out var value) && value.ValueKind == System.Text.Json.JsonValueKind.String
            ? value.GetString()
            : null;

    try
    {
        using var jsonDoc = System.Text.Json.JsonDocument.Parse(jsonContent);
        var root = jsonDoc.RootElement;
        if (root.ValueKind != System.Text.Json.JsonValueKind.Array)
        {
            Console.Error.WriteLine("Warning: content JSON must be an array of content items; no content was added.");
            return;
        }

        foreach (var element in root.EnumerateArray())
        {
            if (element.ValueKind != System.Text.Json.JsonValueKind.Object)
            {
                Console.Error.WriteLine("Warning: skipped content item that is not a JSON object.");
                continue;
            }

            // "text" is optional so that items such as {"type":"pagebreak"} are accepted.
            var type = ReadString(element, "type") ?? "paragraph";
            var text = ReadString(element, "text") ?? "";

            switch (type)
            {
                case "heading":
                {
                    var level = 1;
                    if (element.TryGetProperty("level", out var lvl)
                        && lvl.ValueKind == System.Text.Json.JsonValueKind.Number
                        && lvl.TryGetInt32(out var parsedLevel))
                    {
                        level = parsedLevel;
                    }
                    level = Math.Clamp(level, 1, 6);
                    body.Append(new Paragraph(
                        new ParagraphProperties(new ParagraphStyleId { Val = $"Heading{level}" }),
                        // Preserve keeps leading/trailing whitespace of the supplied text.
                        new Run(new Text(text) { Space = SpaceProcessingModeValues.Preserve })));
                    break;
                }

                case "paragraph":
                    body.Append(new Paragraph(
                        new Run(new Text(text) { Space = SpaceProcessingModeValues.Preserve })));
                    break;

                case "pagebreak":
                    body.Append(new Paragraph(new Run(new Break { Type = BreakValues.Page })));
                    break;

                default:
                    Console.Error.WriteLine($"Warning: skipped content item with unknown type '{type}'.");
                    break;
            }
        }
    }
    catch (System.Text.Json.JsonException ex)
    {
        Console.Error.WriteLine($"Warning: could not parse content JSON: {ex.Message}");
    }
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user