/** * GatewayManager — 多 Profile 网关生命周期管理 * * 核心职责: * 1. 启动时检测所有 profile 的网关运行状态(PID、端口、健康检查) * 2. 自动发现端口冲突并重新分配 * 3. 启动/停止网关进程 * * 启动检测流程(detectStatus): * ① 读取 gateway.pid → 获取 PID * ② 读取 config.yaml (platforms.api_server.extra.port/host) → 获取配置端口 * ③ PID 存活? * - 否 → 标记为 stopped * - 是 → 继续 * ④ 对配置端口做 health check? * - 通过 → 配置与运行状态匹配,注册网关 * - 失败 → 用 lsof 查 PID 实际监听端口 * ⑤ 实际端口 ≠ 配置端口? * - 是 → 更新 config.yaml 到实际端口,重新 health check,通过则注册 * - 否 → 标记为 stopped * * 端口分配流程(resolvePort,启动前调用): * ① 读取配置端口 * ② 检查是否被已管理的网关占用 * ③ 检查是否被外部系统进程占用(TCP bind 测试) * ④ 冲突则从 base+1 递增找空闲端口,并写入 config.yaml * * 启动模式: * - 正常系统(macOS/Linux):hermes gateway start/stop(系统服务管理) * - WSL / Docker:hermes gateway run(detached 子进程,手动 kill) */ import { spawn, execSync, type ChildProcess } from 'child_process' import { resolve, join } from 'path' import { homedir } from 'os' import { readFileSync, writeFileSync, existsSync, readdirSync, unlinkSync } from 'fs' import { execFile } from 'child_process' import { promisify } from 'util' import { createServer } from 'net' import yaml from 'js-yaml' import { logger } from '../logger' import { detectHermesHome, getHermesBin } from './hermes-path' const execFileAsync = promisify(execFile) // ============================ // 常量 & 环境检测 // ============================ const HERMES_BASE = detectHermesHome() const HERMES_BIN = getHermesBin() /** * 检测系统的 init 系统(服务管理器) * - macOS → launchd * - Windows → windows-service * - Linux → systemd / sysvinit / other * * 没有 systemd/launchd/windows-service 的环境需要用 "gateway run" 代替 "gateway start" * (适用于 WSL/Docker/Termux/proot 等无服务管理器的环境) */ function detectInitSystem(): string { const platform = process.platform // macOS → launchd if (platform === 'darwin') { return 'launchd' } // Windows → Service Manager if (platform === 'win32') { return 'windows-service' } // Linux 才检查 /proc if (platform === 'linux') { try { if (existsSync('/.dockerenv') || existsSync('/run/.containerenv')) { return 'container' } const comm = readFileSync('/proc/1/comm', 'utf-8').trim() if (comm === 'systemd') { return existsSync('/run/systemd/system') ? 'systemd' : 'other' } if (comm === 'init') return 'sysvinit' return 'other' } catch { return 'unknown' } } return 'unknown' } // 注意:虽然此函数仍然存在,但当前所有平台都统一使用 run 模式 // 保留此函数是为了将来如果需要切换回 start/stop 模式时可以参考 const initSystem = detectInitSystem() /** * 所有平台统一使用 run 模式 * run 模式会自动处理锁定文件冲突(--replace 标志),更可靠 * 子进程跟随父进程生命周期,父进程关闭时子进程自动关闭 */ const needsRunMode = true // 启动时输出 init 系统检测结果(方便调试) logger.debug('Detected init system: %s (needsRunMode: %s, platform: %s)', initSystem, needsRunMode, process.platform) // ============================ // 类型定义 // ============================ export interface GatewayStatus { profile: string port: number host: string url: string running: boolean pid?: number } interface ManagedGateway { pid: number port: number host: string url: string owned: boolean process?: ChildProcess } interface ResolvedGatewayEndpoint { port: number host: string } function formatHostForUrl(host: string): string { if (host.startsWith('[') && host.endsWith(']')) return host return host.includes(':') ? `[${host}]` : host } function buildHttpUrl(host: string, port: number): string { return `http://${formatHostForUrl(host)}:${port}` } function isLocalHost(host: string): boolean { return ['127.0.0.1', 'localhost', '::1', '[::1]', '0.0.0.0'].includes(host) } function shouldDetachGatewayProcess(): boolean { // In dev mode (nodemon), always detach gateway processes so they survive restarts // Production mode: attach gateways so they can be managed together with the server const override = process.env.HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN?.trim().toLowerCase() const shouldDetach = override === '0' || override === 'false' if (shouldDetach) { console.log('[gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=' + override + ')') } else { console.log('[gateway] Attaching gateway process (prod mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=' + (override || 'not set') + ')') } return shouldDetach } // ============================ // GatewayManager // ============================ export class GatewayManager { /** 已注册的网关:profile name → { pid, port, host, url } */ private gateways = new Map() /** 本次启动过程中已分配的端口集合(防止并发分配到相同端口) */ private allocatedPorts = new Set() /** 当前活跃的 profile(用于代理路由的默认上游) */ private activeProfile: string constructor(activeProfile: string) { this.activeProfile = activeProfile } // ============================ // Profile 目录 & 配置读取 // ============================ /** 获取 profile 的 home 目录路径 */ private profileDir(name: string): string { if (name === 'default') return HERMES_BASE return join(HERMES_BASE, 'profiles', name) } /** * 从 profile 的 config.yaml 读取 api_server 端口和主机 * 读取路径:platforms.api_server.extra.port / extra.host */ private readProfilePort(name: string): { port: number; host: string } { const configPath = join(this.profileDir(name), 'config.yaml') const defaultHost = process.env.GATEWAY_HOST || '127.0.0.1' if (!existsSync(configPath)) return { port: 8642, host: defaultHost } try { const content = readFileSync(configPath, 'utf-8') const cfg = yaml.load(content, { json: true }) as any || {} const extra = cfg?.platforms?.api_server?.extra const rawPort = extra?.port || 8642 const port = typeof rawPort === 'number' ? rawPort : parseInt(rawPort, 10) || 8642 const host = extra?.host || defaultHost // 端口超出合法范围时回退到默认值 return { port: port > 0 && port <= 65535 ? port : 8642, host } } catch { return { port: 8642, host: defaultHost } } } /** Read a profile gateway PID, falling back to runtime state when gateway.pid is missing. */ private readPidFile(name: string): number | null { const profilePath = this.profileDir(name) const pidPath = join(profilePath, 'gateway.pid') try { if (existsSync(pidPath)) { const content = readFileSync(pidPath, 'utf-8').trim() const data = JSON.parse(content) return typeof data.pid === 'number' ? data.pid : parseInt(data.pid, 10) || null } } catch {} const statePath = join(profilePath, 'gateway_state.json') if (!existsSync(statePath)) return null try { const content = readFileSync(statePath, 'utf-8').trim() const data = JSON.parse(content) const pid = typeof data.pid === 'number' ? data.pid : parseInt(data.pid, 10) || null const state = data?.gateway_state return pid && Number.isFinite(pid) && pid > 0 && (state === 'running' || state === 'starting') ? pid : null } catch { return null } } // ============================ // 进程 & 端口检测工具 // ============================ /** Check process liveness without sending a terminating signal. */ private isProcessAlive(pid: number): boolean { try { process.kill(pid, 0) return true } catch (err: any) { return err?.code === 'EPERM' } } /** 请求 /health 端点,判断网关是否真正就绪 */ private async checkHealth(url: string, timeoutMs = 3000): Promise { try { const res = await fetch(`${url.replace(/\/$/, '')}/health`, { signal: AbortSignal.timeout(timeoutMs), }) return res.ok } catch { return false } } /** 尝试绑定端口,检测端口是否被系统级进程占用 */ /** 清理过期的 PID 文件 */ private clearPidFile(name: string): void { try { const pidPath = join(this.profileDir(name), 'gateway.pid') if (existsSync(pidPath)) { unlinkSync(pidPath) logger.debug('Cleared stale PID file for profile "%s"', name) } } catch (err) { logger.debug('Failed to clear PID file: %s', err) } } /** 从 base 端口开始递增查找空闲端口(上限 65535) */ private findFreePort(base: number, host = '127.0.0.1', reservedPorts = new Set()): Promise { return new Promise((resolve, reject) => { const tryPort = (port: number) => { if (port > 65535) { reject(new Error(`No free port found in range ${base}-65535`)) return } if (reservedPorts.has(port)) { tryPort(port + 1) return } const server = createServer() server.once('error', () => { server.close() tryPort(port + 1) }) server.once('listening', () => { server.close() resolve(port) }) server.listen(port, host) } tryPort(base) }) } // ============================ // 配置写入 // ============================ /** * 将端口和主机写入 profile 的 config.yaml * 写入完整结构: * platforms: * api_server: * enabled: true * key: '' * cors_origins: '*' * extra: * port: * host: * 同时清理旧的顶层 port/host(避免 Hermes 读取错误) */ private writeProfilePort(name: string, port: number, host: string): void { const configPath = join(this.profileDir(name), 'config.yaml') try { const content = existsSync(configPath) ? readFileSync(configPath, 'utf-8') : '' const cfg = (yaml.load(content, { json: true }) as any) || {} // 确保 platforms.api_server 结构存在(不会影响其他位置的 platforms) if (!cfg.platforms) cfg.platforms = {} if (!cfg.platforms.api_server) cfg.platforms.api_server = {} if (!cfg.platforms.api_server.extra) cfg.platforms.api_server.extra = {} cfg.platforms.api_server.enabled = true cfg.platforms.api_server.key = '' cfg.platforms.api_server.cors_origins = '*' cfg.platforms.api_server.extra.port = port cfg.platforms.api_server.extra.host = host // 清理旧的顶层 port/host,Hermes 只从 extra 读取 if (cfg.platforms.api_server.port !== undefined) { delete cfg.platforms.api_server.port } if (cfg.platforms.api_server.host !== undefined) { delete cfg.platforms.api_server.host } writeFileSync(configPath, yaml.dump(cfg, { lineWidth: -1 }), 'utf-8') logger.debug('Updated %s: api_server.extra.port = %d', configPath, port) } catch (err) { logger.error(err, 'Failed to write config for profile "%s"', name) } } // ============================ // 端口分配 // ============================ /** * 为 profile 分配可用端口(启动前调用) * * 检测顺序: * 1. 当前 profile 已经健康运行 → 直接使用运行端口 * 2. 未运行 → 从 8642 开始找空闲端口 * 3. 检查已管理 profile / 本轮已分配端口 / 系统 TCP 占用 * 4. 先写入 config.yaml,再启动 gateway */ private async resolvePort(name: string): Promise<{ port: number; host: string }> { const { port: configuredPort, host } = this.readProfilePort(name) const configuredUrl = buildHttpUrl(host, configuredPort) // 检查是否是当前 profile 自己的端口(内存中的记录) const existing = this.gateways.get(name) if (existing && existing.host === host && this.isProcessAlive(existing.pid) && await this.checkHealth(existing.url, 1000)) { // 如果内存中有记录且进程存活,直接使用内存中的端口 logger.info('Profile "%s" already running on port %d (in-memory record)', name, existing.port) this.allocatedPorts.add(existing.port) return { port: existing.port, host } } // 检查 PID 文件指向的当前 profile 是否仍健康运行 const pid = this.readPidFile(name) if (pid && this.isProcessAlive(pid) && await this.checkHealth(configuredUrl, 1000)) { logger.info('Profile "%s" already running on configured port %d (PID: %d)', name, configuredPort, pid) this.gateways.set(name, { pid, port: configuredPort, host, url: configuredUrl, owned: false }) this.allocatedPorts.add(configuredPort) return { port: configuredPort, host } } // 如果没有 PID 文件也没有内存记录,不认领端口上的未知网关 // 如果端口被占用,findFreePort 会分配新端口 // 收集已占用端口:本次启动已分配的端口 + 其他 profile 的网关端口 const usedPorts = new Set(this.allocatedPorts) for (const [profileName, gw] of Array.from(this.gateways.entries())) { // 跳过当前 profile 自己的端口 if (profileName === name) continue if (gw.host === host && this.isProcessAlive(gw.pid)) { usedPorts.add(gw.port) } } const port = await this.findFreePort(8642, host, usedPorts) if (configuredPort !== port) { logger.info('Assigning port for profile "%s": %d → %d', name, configuredPort, port) } else { logger.debug('Assigning port %d for profile "%s"', port, name) } this.writeProfilePort(name, port, host) this.allocatedPorts.add(port) return { port, host } } // ============================ // 公开方法:状态查询 // ============================ /** 获取指定 profile 的网关 URL(代理路由使用) */ getUpstream(profileName?: string): string { const name = profileName || this.activeProfile const gw = this.gateways.get(name) if (gw?.url) return gw.url const { port, host } = this.readProfilePort(name) return buildHttpUrl(host, port) } /** 读取 profile 的 API_SERVER_KEY(从 .env 文件) */ getApiKey(profileName?: string): string | null { const name = profileName || this.activeProfile try { const envPath = join(this.profileDir(name), '.env') if (!existsSync(envPath)) return null const content = readFileSync(envPath, 'utf-8') const match = content.match(/^API_SERVER_KEY\s*=\s*"?([^"\n]+)"?/m) return match?.[1]?.trim() || null } catch { return null } } getActiveProfile(): string { return this.activeProfile } setActiveProfile(name: string) { this.activeProfile = name } /** 列出所有已知 profile 名称(通过 hermes CLI 或文件系统扫描) */ async listProfiles(): Promise { try { const { stdout } = await execFileAsync(HERMES_BIN, ['profile', 'list'], { timeout: 10000, windowsHide: true, }) const profiles: string[] = [] for (const line of stdout.trim().split('\n')) { if (line.startsWith(' Profile') || line.match(/^ ─/)) continue const match = line.match(/^\s+(?:◆)?(.+?)\s+/) if (match) profiles.push(match[1]) } return profiles } catch { // CLI 不可用时回退到文件系统扫描 const profiles = ['default'] const profilesDir = join(HERMES_BASE, 'profiles') if (existsSync(profilesDir)) { for (const entry of readdirSync(profilesDir, { withFileTypes: true })) { if (entry.isDirectory() && existsSync(join(profilesDir, entry.name, 'config.yaml'))) { profiles.push(entry.name) } } } return profiles } } /** * 检测单个 profile 的网关状态(只读,不修改任何进程或配置) * * 流程: * ① 读 PID 文件 → 检查进程是否存活 * ② 读配置端口 → health check * ③ 两者都通过 → 匹配,注册 * ④ 否则 → 标记为未运行(不杀进程,由 startAll 处理) */ async detectStatus(name: string): Promise { const pid = this.readPidFile(name) const { port, host } = this.readProfilePort(name) const url = buildHttpUrl(host, port) // 首先检查 PID 文件:如果存在且进程存活且健康,则标记为运行 if (pid && this.isProcessAlive(pid) && await this.checkHealth(url)) { this.gateways.set(name, { pid, port, host, url, owned: false }) return { profile: name, port, host, url, running: true, pid } } // 没有 PID 文件时不认领端口上的未知网关,避免误判其他 profile 的网关 this.gateways.delete(name) return { profile: name, port, host, url, running: false } } /** 检测所有 profile 的网关状态 */ async listAll(): Promise { const profiles = await this.listProfiles() const statuses = await Promise.all(profiles.map(name => this.detectStatus(name))) return statuses } // ============================ // 公开方法:启动 & 停止 // ============================ /** * 启动单个 profile 的网关 * 启动前自动调用 resolvePort() 确保端口可用且配置完整 */ async start(name: string): Promise { // 检查是否已在运行 const existing = this.gateways.get(name) if (existing && this.isProcessAlive(existing.pid)) { if (await this.checkHealth(existing.url, 1000)) { logger.info('Gateway for profile "%s" already running (PID: %d, port: %d)', name, existing.pid, existing.port) return { profile: name, port: existing.port, host: existing.host, url: existing.url, running: true, pid: existing.pid } } logger.info('Gateway for profile "%s" is alive but unhealthy (PID: %d, port: %d), restarting', name, existing.pid, existing.port) try { await this.stop(name) } catch (err) { logger.debug('Failed to stop unhealthy gateway before restart: %s', err) } } const endpoint = await this.resolvePort(name) return this.startResolved(name, endpoint) } /** 使用已经解析好的端口启动网关,避免 startAll() 中重复分配端口 */ private async startResolved(name: string, endpoint: ResolvedGatewayEndpoint): Promise { const { port, host } = endpoint const hermesHome = this.profileDir(name) const url = buildHttpUrl(host, port) // Windows 特定:清理僵尸锁定文件 if (process.platform === 'win32') { const lockPath = join(hermesHome, 'gateway.lock') if (existsSync(lockPath)) { try { const content = readFileSync(lockPath, 'utf-8').trim() const lockData = JSON.parse(content) const pid = lockData.pid if (pid && !this.isProcessAlive(pid)) { logger.warn('Found stale gateway lock file (PID: %d), attempting cleanup', pid) try { // 使用 Node.js 内置方法删除文件,避免 PowerShell 弹窗 unlinkSync(lockPath) logger.info('Successfully removed stale lock file') } catch (err) { logger.debug('Failed to remove lock file: %s', err) } } } catch (err) { logger.debug('Failed to check lock file: %s', err) } } } // 所有平台统一使用 run 模式;dev/nodemon 可通过 env 保留 gateway 进程。 return new Promise((resolve, reject) => { const env = { ...process.env, HERMES_HOME: hermesHome } const detachGateway = shouldDetachGatewayProcess() const child = spawn(HERMES_BIN, ['gateway', 'run', '--replace'], { stdio: 'ignore', detached: detachGateway, windowsHide: true, env, }) if (detachGateway) { child.unref() } const pid = child.pid ?? 0 logger.info('Starting gateway for profile "%s" (run mode, PID: %d, port: %d, detached: %s)', name, pid, port, detachGateway) // 保存子进程引用,用于后续管理 this.gateways.set(name, { pid, port, host, url, owned: true, process: child }) this.waitForReady(name, pid, port, host, url) .then(resolve) .catch(reject) }) } /** 等待网关健康检查通过,最多 15 秒 */ private async waitForReady(name: string, pid: number, port: number, host: string, url: string): Promise { const deadline = Date.now() + 15000 while (Date.now() < deadline) { if (pid && !this.isProcessAlive(pid)) { throw new Error(`Gateway process exited unexpectedly (PID: ${pid})`) } if (await this.checkHealth(url, 2000)) { // "gateway start" 自行管理进程,重新从 pid 文件读取实际 PID const actualPid = this.readPidFile(name) ?? pid const previous = this.gateways.get(name) this.gateways.set(name, { pid: actualPid, port, host, url, owned: previous?.owned ?? true, process: previous?.process, }) return { profile: name, port, host, url, running: true, pid: actualPid || undefined } } await new Promise(r => setTimeout(r, 500)) } throw new Error(`Gateway health check timed out after 15000ms`) } private async getListeningPids(port: number): Promise { try { if (process.platform === 'win32') { const { stdout } = await execFileAsync('netstat', ['-ano', '-p', 'tcp'], { timeout: 5000, windowsHide: true, }) const pids = new Set() for (const line of stdout.split(/\r?\n/)) { const parts = line.trim().split(/\s+/) if (parts.length < 5 || parts[0].toUpperCase() !== 'TCP') continue const localAddress = parts[1] const state = parts[3]?.toUpperCase() const pid = parseInt(parts[4], 10) if (state === 'LISTENING' && localAddress.endsWith(`:${port}`) && Number.isFinite(pid)) { pids.add(pid) } } return Array.from(pids) } const { stdout } = await execFileAsync('lsof', [`-tiTCP:${port}`, '-sTCP:LISTEN'], { timeout: 5000, }) return stdout .split(/\r?\n/) .map(line => parseInt(line.trim(), 10)) .filter(pid => Number.isFinite(pid)) } catch { return [] } } private async killPid(pid: number, force = false): Promise { if (!pid) return if (process.platform === 'win32') { try { await execFileAsync('taskkill', ['/PID', String(pid), '/T', '/F'], { timeout: 5000, windowsHide: true, }) } catch { try { process.kill(pid) } catch { } } return } const signal = force ? 'SIGKILL' : 'SIGTERM' try { process.kill(-pid, signal) } catch { try { process.kill(pid, signal) } catch { } } } private async stopViaHermesCli(name: string): Promise { const hermesHome = this.profileDir(name) try { const { stdout, stderr } = await execFileAsync(HERMES_BIN, ['gateway', 'stop'], { timeout: 15000, windowsHide: true, env: { ...process.env, HERMES_HOME: hermesHome }, }) const output = `${stdout}${stderr}`.trim() if (output) logger.debug('%s: hermes gateway stop: %s', name, output) } catch (err) { logger.debug('Failed to stop gateway via Hermes CLI for profile "%s": %s', name, err) } } /** * 停止单个 profile 的网关 * 所有平台使用 run 模式,直接 kill 进程 * 返回前等待 health check 确认网关已真正停止 */ async stop(name: string, timeoutMs = 10000): Promise { // 记录当前 URL,用于确认停止 const gw = this.gateways.get(name) const configured = this.readProfilePort(name) const port = gw?.port ?? configured.port const host = gw?.host ?? configured.host const url = gw?.url || buildHttpUrl(host, port) // 所有平台使用 run 模式,直接杀进程 const pids = new Set() if (gw?.process?.pid) pids.add(gw.process.pid) if (gw?.pid) pids.add(gw.pid) const pidFilePid = this.readPidFile(name) if (pidFilePid) pids.add(pidFilePid) if (isLocalHost(host)) { for (const pid of await this.getListeningPids(port)) { pids.add(pid) } } if (pids.size === 0) { if (!(await this.checkHealth(url, 1000))) { this.gateways.delete(name) this.allocatedPorts.delete(port) this.clearPidFile(name) logger.info('Stopped gateway for profile "%s" (already stopped)', name) return } await this.stopViaHermesCli(name) if (!(await this.checkHealth(url, 1000))) { this.gateways.delete(name) this.allocatedPorts.delete(port) this.clearPidFile(name) logger.info('Stopped gateway for profile "%s"', name) return } throw new Error(`Cannot stop gateway for profile "${name}": no PID available`) } await this.stopViaHermesCli(name) if (!(await this.checkHealth(url, 1000))) { this.gateways.delete(name) this.allocatedPorts.delete(port) this.clearPidFile(name) logger.info('Stopped gateway for profile "%s"', name) return } if (gw?.process && !gw.process.killed) { try { gw.process.kill(process.platform === 'win32' ? undefined : 'SIGTERM') } catch { } } for (const pid of pids) { await this.killPid(pid) } // 等待 health check 失败,确认网关已真正停止 const deadline = Date.now() + timeoutMs while (Date.now() < deadline) { if (!(await this.checkHealth(url, 1000))) { this.gateways.delete(name) this.allocatedPorts.delete(port) this.clearPidFile(name) logger.info('Stopped gateway for profile "%s"', name) return } await new Promise(r => setTimeout(r, 300)) } if (isLocalHost(host)) { const listeningPids = await this.getListeningPids(port) if (listeningPids.length) { logger.warn( 'Gateway for profile "%s" still listening on port %d, force killing PIDs: %s', name, port, listeningPids.join(', '), ) for (const pid of listeningPids) { await this.killPid(pid, true) } const forceDeadline = Date.now() + 3000 while (Date.now() < forceDeadline) { if (!(await this.checkHealth(url, 500))) { this.gateways.delete(name) this.allocatedPorts.delete(port) this.clearPidFile(name) logger.info('Stopped gateway for profile "%s" (force killed)', name) return } await new Promise(r => setTimeout(r, 200)) } } } logger.warn('Failed to stop gateway for profile "%s" within %dms', name, timeoutMs) throw new Error(`Gateway stop timed out after ${timeoutMs}ms`) } /** 停止所有已管理的网关(并行执行) */ async stopAll(): Promise { const entries = Array.from(this.gateways.entries()) .filter(([, gw]) => gw.owned) .map(([name]) => name) await Promise.allSettled(entries.map(name => this.stop(name))) } // ============================ // 批量操作(启动时调用) // ============================ /** 扫描所有 profile,检测网关运行状态并注册 */ async detectAllOnStartup(): Promise { logger.info('Scanning profiles for running gateways...') const profiles = await this.listProfiles() for (const name of profiles) { const status = await this.detectStatus(name) if (status.running) { logger.info('%s: running (PID: %s, port: %d)', name, status.pid, status.port) } else { logger.debug('%s: stopped', name) } } } /** * 启动所有未运行的网关 * * 两阶段执行: * Phase 1 — 顺序处理:检查状态、清理旧进程、分配端口 * Phase 2 — 并行启动网关进程 */ async startAll(): Promise { // 清空已分配端口集合,确保每次启动都从干净状态开始 this.allocatedPorts.clear() const profiles = await this.listProfiles() // Phase 1: 顺序处理 const toStart: Array<{ name: string; endpoint: ResolvedGatewayEndpoint }> = [] for (const name of profiles) { const existing = this.gateways.get(name) if (existing && this.isProcessAlive(existing.pid)) { if (await this.checkHealth(existing.url, 1000)) { logger.info('%s: already running (PID: %d, port: %d)', name, existing.pid, existing.port) continue } logger.info('%s: process alive but unhealthy (PID: %d, port: %d), restarting', name, existing.pid, existing.port) try { await this.stop(name) } catch (err) { logger.debug('Failed to stop unhealthy gateway: %s', err) } } // Skip remote profiles — local hermes command cannot start remote gateways const { host } = this.readProfilePort(name) if (host && host !== '127.0.0.1' && host !== 'localhost') { logger.info('%s: remote profile (host=%s), skipping auto-start', name, host) continue } // 有 PID 文件但进程未在正确端口运行 → 通过 health check 检查网关状态 const pid = this.readPidFile(name) if (pid && this.isProcessAlive(pid)) { const { port: configuredPort, host } = this.readProfilePort(name) const configuredUrl = buildHttpUrl(host, configuredPort) // 检查配置文件中的端口是否有正常的网关在运行 if (await this.checkHealth(configuredUrl, 2000)) { // Health check 通过,说明网关正常工作 logger.info('%s: gateway already running on configured port %d (PID: %d, health check passed)', name, configuredPort, pid) // 注册到内存中 this.gateways.set(name, { pid, port: configuredPort, host, url: configuredUrl, owned: false }) continue } else { // Health check 失败,说明网关有问题(僵尸进程或端口冲突) logger.info('%s: stale process (PID: %d) health check failed on port %d, stopping and restarting', name, pid, configuredPort) try { await this.stop(name) } catch (err) { logger.debug('Failed to stop stale gateway: %s', err) } // 清理过期的 PID 文件 this.clearPidFile(name) } } // 只为真正需要启动的网关分配端口 const endpoint = await this.resolvePort(name) toStart.push({ name, endpoint }) } // Phase 2: 并行启动 // 串行启动网关,避免并发时的lock file竞争条件 for (const { name, endpoint } of toStart) { try { await this.startResolved(name, endpoint) } catch (err: any) { logger.error(err, '%s: failed to start', name) } } } }