Files
Hermes-ui/packages/server/src/services/hermes/gateway-manager.ts
T

906 lines
31 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* GatewayManager — 多 Profile 网关生命周期管理
*
* 核心职责:
* 1. 启动时检测所有 profile 的网关运行状态(PID、端口、健康检查)
* 2. 自动发现端口冲突并重新分配
* 3. 启动/停止网关进程
*
* 启动检测流程(detectStatus):
* ① 读取 gateway.pid → 获取 PID
* ② 读取 config.yaml (platforms.api_server.extra.port/host) → 获取配置端口
* ③ PID 存活?
* - 否 → 标记为 stopped
* - 是 → 继续
* ④ 对配置端口做 health check
* - 通过 → 配置与运行状态匹配,注册网关
* - 失败 → 用 lsof 查 PID 实际监听端口
* ⑤ 实际端口 ≠ 配置端口?
* - 是 → 更新 config.yaml 到实际端口,重新 health check,通过则注册
* - 否 → 标记为 stopped
*
* 端口分配流程(resolvePort,启动前调用):
* ① 读取配置端口
* ② 检查是否被已管理的网关占用
* ③ 检查是否被外部系统进程占用(TCP bind 测试)
* ④ 冲突则从 base+1 递增找空闲端口,并写入 config.yaml
*
* 启动模式:
* - 正常系统(macOS/Linux):hermes gateway start/stop(系统服务管理)
* - WSL / Dockerhermes gateway rundetached 子进程,手动 kill
*/
import { spawn, execSync, type ChildProcess } from 'child_process'
import { resolve, join } from 'path'
import { homedir } from 'os'
import { readFileSync, writeFileSync, existsSync, readdirSync, unlinkSync } from 'fs'
import { execFile } from 'child_process'
import { promisify } from 'util'
import { createServer } from 'net'
import yaml from 'js-yaml'
import { logger } from '../logger'
import { detectHermesHome, getHermesBin } from './hermes-path'
const execFileAsync = promisify(execFile)
// ============================
// 常量 & 环境检测
// ============================
const HERMES_BASE = detectHermesHome()
const HERMES_BIN = getHermesBin()
/**
* 检测系统的 init 系统(服务管理器)
* - macOS → launchd
* - Windows → windows-service
* - Linux → systemd / sysvinit / other
*
* 没有 systemd/launchd/windows-service 的环境需要用 "gateway run" 代替 "gateway start"
* (适用于 WSL/Docker/Termux/proot 等无服务管理器的环境)
*/
function detectInitSystem(): string {
const platform = process.platform
// macOS → launchd
if (platform === 'darwin') {
return 'launchd'
}
// Windows → Service Manager
if (platform === 'win32') {
return 'windows-service'
}
// Linux 才检查 /proc
if (platform === 'linux') {
try {
if (existsSync('/.dockerenv') || existsSync('/run/.containerenv')) {
return 'container'
}
const comm = readFileSync('/proc/1/comm', 'utf-8').trim()
if (comm === 'systemd') {
return existsSync('/run/systemd/system') ? 'systemd' : 'other'
}
if (comm === 'init') return 'sysvinit'
return 'other'
} catch {
return 'unknown'
}
}
return 'unknown'
}
// 注意:虽然此函数仍然存在,但当前所有平台都统一使用 run 模式
// 保留此函数是为了将来如果需要切换回 start/stop 模式时可以参考
const initSystem = detectInitSystem()
/**
* 所有平台统一使用 run 模式
* run 模式会自动处理锁定文件冲突(--replace 标志),更可靠
* 子进程跟随父进程生命周期,父进程关闭时子进程自动关闭
*/
const needsRunMode = true
// 启动时输出 init 系统检测结果(方便调试)
logger.debug('Detected init system: %s (needsRunMode: %s, platform: %s)', initSystem, needsRunMode, process.platform)
// ============================
// 类型定义
// ============================
export interface GatewayStatus {
profile: string
port: number
host: string
url: string
running: boolean
pid?: number
}
interface ManagedGateway {
pid: number
port: number
host: string
url: string
owned: boolean
process?: ChildProcess
}
interface ResolvedGatewayEndpoint {
port: number
host: string
}
function formatHostForUrl(host: string): string {
if (host.startsWith('[') && host.endsWith(']')) return host
return host.includes(':') ? `[${host}]` : host
}
function buildHttpUrl(host: string, port: number): string {
return `http://${formatHostForUrl(host)}:${port}`
}
function isLocalHost(host: string): boolean {
return ['127.0.0.1', 'localhost', '::1', '[::1]', '0.0.0.0'].includes(host)
}
function shouldDetachGatewayProcess(): boolean {
const override = process.env.HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN?.trim().toLowerCase()
return override === '0' || override === 'false'
}
// ============================
// GatewayManager
// ============================
export class GatewayManager {
/** 已注册的网关:profile name → { pid, port, host, url } */
private gateways = new Map<string, ManagedGateway>()
/** 本次启动过程中已分配的端口集合(防止并发分配到相同端口) */
private allocatedPorts = new Set<number>()
/** 当前活跃的 profile(用于代理路由的默认上游) */
private activeProfile: string
constructor(activeProfile: string) {
this.activeProfile = activeProfile
}
// ============================
// Profile 目录 & 配置读取
// ============================
/** 获取 profile 的 home 目录路径 */
private profileDir(name: string): string {
if (name === 'default') return HERMES_BASE
return join(HERMES_BASE, 'profiles', name)
}
/**
* 从 profile 的 config.yaml 读取 api_server 端口和主机
* 读取路径:platforms.api_server.extra.port / extra.host
*/
private readProfilePort(name: string): { port: number; host: string } {
const configPath = join(this.profileDir(name), 'config.yaml')
const defaultHost = process.env.GATEWAY_HOST || '127.0.0.1'
if (!existsSync(configPath)) return { port: 8642, host: defaultHost }
try {
const content = readFileSync(configPath, 'utf-8')
const cfg = yaml.load(content, { json: true }) as any || {}
const extra = cfg?.platforms?.api_server?.extra
const rawPort = extra?.port || 8642
const port = typeof rawPort === 'number' ? rawPort : parseInt(rawPort, 10) || 8642
const host = extra?.host || defaultHost
// 端口超出合法范围时回退到默认值
return { port: port > 0 && port <= 65535 ? port : 8642, host }
} catch {
return { port: 8642, host: defaultHost }
}
}
/** Read a profile gateway PID, falling back to runtime state when gateway.pid is missing. */
private readPidFile(name: string): number | null {
const profilePath = this.profileDir(name)
const pidPath = join(profilePath, 'gateway.pid')
try {
if (existsSync(pidPath)) {
const content = readFileSync(pidPath, 'utf-8').trim()
const data = JSON.parse(content)
return typeof data.pid === 'number' ? data.pid : parseInt(data.pid, 10) || null
}
} catch {}
const statePath = join(profilePath, 'gateway_state.json')
if (!existsSync(statePath)) return null
try {
const content = readFileSync(statePath, 'utf-8').trim()
const data = JSON.parse(content)
const pid = typeof data.pid === 'number' ? data.pid : parseInt(data.pid, 10) || null
const state = data?.gateway_state
return pid && Number.isFinite(pid) && pid > 0 && (state === 'running' || state === 'starting') ? pid : null
} catch {
return null
}
}
// ============================
// 进程 & 端口检测工具
// ============================
/** Check process liveness without sending a terminating signal. */
private isProcessAlive(pid: number): boolean {
try {
process.kill(pid, 0)
return true
} catch (err: any) {
return err?.code === 'EPERM'
}
}
/** 请求 /health 端点,判断网关是否真正就绪 */
private async checkHealth(url: string, timeoutMs = 3000): Promise<boolean> {
try {
const res = await fetch(`${url.replace(/\/$/, '')}/health`, {
signal: AbortSignal.timeout(timeoutMs),
})
return res.ok
} catch {
return false
}
}
/** 尝试绑定端口,检测端口是否被系统级进程占用 */
/** 清理过期的 PID 文件 */
private clearPidFile(name: string): void {
try {
const pidPath = join(this.profileDir(name), 'gateway.pid')
if (existsSync(pidPath)) {
unlinkSync(pidPath)
logger.debug('Cleared stale PID file for profile "%s"', name)
}
} catch (err) {
logger.debug('Failed to clear PID file: %s', err)
}
}
/** 从 base 端口开始递增查找空闲端口(上限 65535) */
private findFreePort(base: number, host = '127.0.0.1', reservedPorts = new Set<number>()): Promise<number> {
return new Promise((resolve, reject) => {
const tryPort = (port: number) => {
if (port > 65535) {
reject(new Error(`No free port found in range ${base}-65535`))
return
}
if (reservedPorts.has(port)) {
tryPort(port + 1)
return
}
const server = createServer()
server.once('error', () => {
server.close()
tryPort(port + 1)
})
server.once('listening', () => {
server.close()
resolve(port)
})
server.listen(port, host)
}
tryPort(base)
})
}
// ============================
// 配置写入
// ============================
/**
* 将端口和主机写入 profile 的 config.yaml
* 写入完整结构:
* platforms:
* api_server:
* enabled: true
* key: ''
* cors_origins: '*'
* extra:
* port: <port>
* host: <host>
* 同时清理旧的顶层 port/host(避免 Hermes 读取错误)
*/
private writeProfilePort(name: string, port: number, host: string): void {
const configPath = join(this.profileDir(name), 'config.yaml')
try {
const content = existsSync(configPath) ? readFileSync(configPath, 'utf-8') : ''
const cfg = (yaml.load(content, { json: true }) as any) || {}
// 确保 platforms.api_server 结构存在(不会影响其他位置的 platforms)
if (!cfg.platforms) cfg.platforms = {}
if (!cfg.platforms.api_server) cfg.platforms.api_server = {}
if (!cfg.platforms.api_server.extra) cfg.platforms.api_server.extra = {}
cfg.platforms.api_server.enabled = true
cfg.platforms.api_server.key = ''
cfg.platforms.api_server.cors_origins = '*'
cfg.platforms.api_server.extra.port = port
cfg.platforms.api_server.extra.host = host
// 清理旧的顶层 port/hostHermes 只从 extra 读取
if (cfg.platforms.api_server.port !== undefined) {
delete cfg.platforms.api_server.port
}
if (cfg.platforms.api_server.host !== undefined) {
delete cfg.platforms.api_server.host
}
writeFileSync(configPath, yaml.dump(cfg, { lineWidth: -1 }), 'utf-8')
logger.debug('Updated %s: api_server.extra.port = %d', configPath, port)
} catch (err) {
logger.error(err, 'Failed to write config for profile "%s"', name)
}
}
// ============================
// 端口分配
// ============================
/**
* 为 profile 分配可用端口(启动前调用)
*
* 检测顺序:
* 1. 当前 profile 已经健康运行 → 直接使用运行端口
* 2. 未运行 → 从 8642 开始找空闲端口
* 3. 检查已管理 profile / 本轮已分配端口 / 系统 TCP 占用
* 4. 先写入 config.yaml,再启动 gateway
*/
private async resolvePort(name: string): Promise<{ port: number; host: string }> {
const { port: configuredPort, host } = this.readProfilePort(name)
const configuredUrl = buildHttpUrl(host, configuredPort)
// 检查是否是当前 profile 自己的端口(内存中的记录)
const existing = this.gateways.get(name)
if (existing && existing.host === host && this.isProcessAlive(existing.pid) && await this.checkHealth(existing.url, 1000)) {
// 如果内存中有记录且进程存活,直接使用内存中的端口
logger.info('Profile "%s" already running on port %d (in-memory record)', name, existing.port)
this.allocatedPorts.add(existing.port)
return { port: existing.port, host }
}
// 检查 PID 文件指向的当前 profile 是否仍健康运行
const pid = this.readPidFile(name)
if (pid && this.isProcessAlive(pid) && await this.checkHealth(configuredUrl, 1000)) {
logger.info('Profile "%s" already running on configured port %d (PID: %d)', name, configuredPort, pid)
this.gateways.set(name, { pid, port: configuredPort, host, url: configuredUrl, owned: false })
this.allocatedPorts.add(configuredPort)
return { port: configuredPort, host }
}
// 如果没有 PID 文件也没有内存记录,不认领端口上的未知网关
// 如果端口被占用,findFreePort 会分配新端口
// 收集已占用端口:本次启动已分配的端口 + 其他 profile 的网关端口
const usedPorts = new Set<number>(this.allocatedPorts)
for (const [profileName, gw] of Array.from(this.gateways.entries())) {
// 跳过当前 profile 自己的端口
if (profileName === name) continue
if (gw.host === host && this.isProcessAlive(gw.pid)) {
usedPorts.add(gw.port)
}
}
const port = await this.findFreePort(8642, host, usedPorts)
if (configuredPort !== port) {
logger.info('Assigning port for profile "%s": %d → %d', name, configuredPort, port)
} else {
logger.debug('Assigning port %d for profile "%s"', port, name)
}
this.writeProfilePort(name, port, host)
this.allocatedPorts.add(port)
return { port, host }
}
// ============================
// 公开方法:状态查询
// ============================
/** 获取指定 profile 的网关 URL(代理路由使用) */
getUpstream(profileName?: string): string {
const name = profileName || this.activeProfile
const gw = this.gateways.get(name)
if (gw?.url) return gw.url
const { port, host } = this.readProfilePort(name)
return buildHttpUrl(host, port)
}
/** 读取 profile 的 API_SERVER_KEY(从 .env 文件) */
getApiKey(profileName?: string): string | null {
const name = profileName || this.activeProfile
try {
const envPath = join(this.profileDir(name), '.env')
if (!existsSync(envPath)) return null
const content = readFileSync(envPath, 'utf-8')
const match = content.match(/^API_SERVER_KEY\s*=\s*"?([^"\n]+)"?/m)
return match?.[1]?.trim() || null
} catch {
return null
}
}
getActiveProfile(): string {
return this.activeProfile
}
setActiveProfile(name: string) {
this.activeProfile = name
}
/** 列出所有已知 profile 名称(通过 hermes CLI 或文件系统扫描) */
async listProfiles(): Promise<string[]> {
try {
const { stdout } = await execFileAsync(HERMES_BIN, ['profile', 'list'], {
timeout: 10000,
windowsHide: true,
})
const profiles: string[] = []
for (const line of stdout.trim().split('\n')) {
if (line.startsWith(' Profile') || line.match(/^ ─/)) continue
const match = line.match(/^\s+(?:◆)?(.+?)\s+/)
if (match) profiles.push(match[1])
}
return profiles
} catch {
// CLI 不可用时回退到文件系统扫描
const profiles = ['default']
const profilesDir = join(HERMES_BASE, 'profiles')
if (existsSync(profilesDir)) {
for (const entry of readdirSync(profilesDir, { withFileTypes: true })) {
if (entry.isDirectory() && existsSync(join(profilesDir, entry.name, 'config.yaml'))) {
profiles.push(entry.name)
}
}
}
return profiles
}
}
/**
* 检测单个 profile 的网关状态(只读,不修改任何进程或配置)
*
* 流程:
* ① 读 PID 文件 → 检查进程是否存活
* ② 读配置端口 → health check
* ③ 两者都通过 → 匹配,注册
* ④ 否则 → 标记为未运行(不杀进程,由 startAll 处理)
*/
async detectStatus(name: string): Promise<GatewayStatus> {
const pid = this.readPidFile(name)
const { port, host } = this.readProfilePort(name)
const url = buildHttpUrl(host, port)
// 首先检查 PID 文件:如果存在且进程存活且健康,则标记为运行
if (pid && this.isProcessAlive(pid) && await this.checkHealth(url)) {
this.gateways.set(name, { pid, port, host, url, owned: false })
return { profile: name, port, host, url, running: true, pid }
}
// 没有 PID 文件时不认领端口上的未知网关,避免误判其他 profile 的网关
this.gateways.delete(name)
return { profile: name, port, host, url, running: false }
}
/** 检测所有 profile 的网关状态 */
async listAll(): Promise<GatewayStatus[]> {
const profiles = await this.listProfiles()
const statuses = await Promise.all(profiles.map(name => this.detectStatus(name)))
return statuses
}
// ============================
// 公开方法:启动 & 停止
// ============================
/**
* 启动单个 profile 的网关
* 启动前自动调用 resolvePort() 确保端口可用且配置完整
*/
async start(name: string): Promise<GatewayStatus> {
// 检查是否已在运行
const existing = this.gateways.get(name)
if (existing && this.isProcessAlive(existing.pid)) {
if (await this.checkHealth(existing.url, 1000)) {
logger.info('Gateway for profile "%s" already running (PID: %d, port: %d)', name, existing.pid, existing.port)
return { profile: name, port: existing.port, host: existing.host, url: existing.url, running: true, pid: existing.pid }
}
logger.info('Gateway for profile "%s" is alive but unhealthy (PID: %d, port: %d), restarting',
name, existing.pid, existing.port)
try {
await this.stop(name)
} catch (err) {
logger.debug('Failed to stop unhealthy gateway before restart: %s', err)
}
}
const endpoint = await this.resolvePort(name)
return this.startResolved(name, endpoint)
}
/** 使用已经解析好的端口启动网关,避免 startAll() 中重复分配端口 */
private async startResolved(name: string, endpoint: ResolvedGatewayEndpoint): Promise<GatewayStatus> {
const { port, host } = endpoint
const hermesHome = this.profileDir(name)
const url = buildHttpUrl(host, port)
// Windows 特定:清理僵尸锁定文件
if (process.platform === 'win32') {
const lockPath = join(hermesHome, 'gateway.lock')
if (existsSync(lockPath)) {
try {
const content = readFileSync(lockPath, 'utf-8').trim()
const lockData = JSON.parse(content)
const pid = lockData.pid
if (pid && !this.isProcessAlive(pid)) {
logger.warn('Found stale gateway lock file (PID: %d), attempting cleanup', pid)
try {
// 使用 Node.js 内置方法删除文件,避免 PowerShell 弹窗
unlinkSync(lockPath)
logger.info('Successfully removed stale lock file')
} catch (err) {
logger.debug('Failed to remove lock file: %s', err)
}
}
} catch (err) {
logger.debug('Failed to check lock file: %s', err)
}
}
}
// 所有平台统一使用 run 模式;dev/nodemon 可通过 env 保留 gateway 进程。
return new Promise((resolve, reject) => {
const env = { ...process.env, HERMES_HOME: hermesHome }
const detachGateway = shouldDetachGatewayProcess()
const child = spawn(HERMES_BIN, ['gateway', 'run', '--replace'], {
stdio: 'ignore',
detached: detachGateway,
windowsHide: true,
env,
})
if (detachGateway) {
child.unref()
}
const pid = child.pid ?? 0
logger.info('Starting gateway for profile "%s" (run mode, PID: %d, port: %d, detached: %s)', name, pid, port, detachGateway)
// 保存子进程引用,用于后续管理
this.gateways.set(name, { pid, port, host, url, owned: true, process: child })
this.waitForReady(name, pid, port, host, url)
.then(resolve)
.catch(reject)
})
}
/** 等待网关健康检查通过,最多 15 秒 */
private async waitForReady(name: string, pid: number, port: number, host: string, url: string): Promise<GatewayStatus> {
const deadline = Date.now() + 15000
while (Date.now() < deadline) {
if (pid && !this.isProcessAlive(pid)) {
throw new Error(`Gateway process exited unexpectedly (PID: ${pid})`)
}
if (await this.checkHealth(url, 2000)) {
// "gateway start" 自行管理进程,重新从 pid 文件读取实际 PID
const actualPid = this.readPidFile(name) ?? pid
const previous = this.gateways.get(name)
this.gateways.set(name, {
pid: actualPid,
port,
host,
url,
owned: previous?.owned ?? true,
process: previous?.process,
})
return { profile: name, port, host, url, running: true, pid: actualPid || undefined }
}
await new Promise(r => setTimeout(r, 500))
}
throw new Error(`Gateway health check timed out after 15000ms`)
}
private async getListeningPids(port: number): Promise<number[]> {
try {
if (process.platform === 'win32') {
const { stdout } = await execFileAsync('netstat', ['-ano', '-p', 'tcp'], {
timeout: 5000,
windowsHide: true,
})
const pids = new Set<number>()
for (const line of stdout.split(/\r?\n/)) {
const parts = line.trim().split(/\s+/)
if (parts.length < 5 || parts[0].toUpperCase() !== 'TCP') continue
const localAddress = parts[1]
const state = parts[3]?.toUpperCase()
const pid = parseInt(parts[4], 10)
if (state === 'LISTENING' && localAddress.endsWith(`:${port}`) && Number.isFinite(pid)) {
pids.add(pid)
}
}
return Array.from(pids)
}
const { stdout } = await execFileAsync('lsof', [`-tiTCP:${port}`, '-sTCP:LISTEN'], {
timeout: 5000,
})
return stdout
.split(/\r?\n/)
.map(line => parseInt(line.trim(), 10))
.filter(pid => Number.isFinite(pid))
} catch {
return []
}
}
private async killPid(pid: number, force = false): Promise<void> {
if (!pid) return
if (process.platform === 'win32') {
try {
await execFileAsync('taskkill', ['/PID', String(pid), '/T', '/F'], {
timeout: 5000,
windowsHide: true,
})
} catch {
try { process.kill(pid) } catch { }
}
return
}
const signal = force ? 'SIGKILL' : 'SIGTERM'
try {
process.kill(-pid, signal)
} catch {
try { process.kill(pid, signal) } catch { }
}
}
private async stopViaHermesCli(name: string): Promise<void> {
const hermesHome = this.profileDir(name)
try {
const { stdout, stderr } = await execFileAsync(HERMES_BIN, ['gateway', 'stop'], {
timeout: 15000,
windowsHide: true,
env: { ...process.env, HERMES_HOME: hermesHome },
})
const output = `${stdout}${stderr}`.trim()
if (output) logger.debug('%s: hermes gateway stop: %s', name, output)
} catch (err) {
logger.debug('Failed to stop gateway via Hermes CLI for profile "%s": %s', name, err)
}
}
/**
* 停止单个 profile 的网关
* 所有平台使用 run 模式,直接 kill 进程
* 返回前等待 health check 确认网关已真正停止
*/
async stop(name: string, timeoutMs = 10000): Promise<void> {
// 记录当前 URL,用于确认停止
const gw = this.gateways.get(name)
const configured = this.readProfilePort(name)
const port = gw?.port ?? configured.port
const host = gw?.host ?? configured.host
const url = gw?.url || buildHttpUrl(host, port)
// 所有平台使用 run 模式,直接杀进程
const pids = new Set<number>()
if (gw?.process?.pid) pids.add(gw.process.pid)
if (gw?.pid) pids.add(gw.pid)
const pidFilePid = this.readPidFile(name)
if (pidFilePid) pids.add(pidFilePid)
if (isLocalHost(host)) {
for (const pid of await this.getListeningPids(port)) {
pids.add(pid)
}
}
if (pids.size === 0) {
if (!(await this.checkHealth(url, 1000))) {
this.gateways.delete(name)
this.allocatedPorts.delete(port)
this.clearPidFile(name)
logger.info('Stopped gateway for profile "%s" (already stopped)', name)
return
}
await this.stopViaHermesCli(name)
if (!(await this.checkHealth(url, 1000))) {
this.gateways.delete(name)
this.allocatedPorts.delete(port)
this.clearPidFile(name)
logger.info('Stopped gateway for profile "%s"', name)
return
}
throw new Error(`Cannot stop gateway for profile "${name}": no PID available`)
}
await this.stopViaHermesCli(name)
if (!(await this.checkHealth(url, 1000))) {
this.gateways.delete(name)
this.allocatedPorts.delete(port)
this.clearPidFile(name)
logger.info('Stopped gateway for profile "%s"', name)
return
}
if (gw?.process && !gw.process.killed) {
try { gw.process.kill(process.platform === 'win32' ? undefined : 'SIGTERM') } catch { }
}
for (const pid of pids) {
await this.killPid(pid)
}
// 等待 health check 失败,确认网关已真正停止
const deadline = Date.now() + timeoutMs
while (Date.now() < deadline) {
if (!(await this.checkHealth(url, 1000))) {
this.gateways.delete(name)
this.allocatedPorts.delete(port)
this.clearPidFile(name)
logger.info('Stopped gateway for profile "%s"', name)
return
}
await new Promise(r => setTimeout(r, 300))
}
if (isLocalHost(host)) {
const listeningPids = await this.getListeningPids(port)
if (listeningPids.length) {
logger.warn(
'Gateway for profile "%s" still listening on port %d, force killing PIDs: %s',
name,
port,
listeningPids.join(', '),
)
for (const pid of listeningPids) {
await this.killPid(pid, true)
}
const forceDeadline = Date.now() + 3000
while (Date.now() < forceDeadline) {
if (!(await this.checkHealth(url, 500))) {
this.gateways.delete(name)
this.allocatedPorts.delete(port)
this.clearPidFile(name)
logger.info('Stopped gateway for profile "%s" (force killed)', name)
return
}
await new Promise(r => setTimeout(r, 200))
}
}
}
logger.warn('Failed to stop gateway for profile "%s" within %dms', name, timeoutMs)
throw new Error(`Gateway stop timed out after ${timeoutMs}ms`)
}
/** 停止所有已管理的网关(并行执行) */
async stopAll(): Promise<void> {
const entries = Array.from(this.gateways.entries())
.filter(([, gw]) => gw.owned)
.map(([name]) => name)
await Promise.allSettled(entries.map(name => this.stop(name)))
}
// ============================
// 批量操作(启动时调用)
// ============================
/** 扫描所有 profile,检测网关运行状态并注册 */
async detectAllOnStartup(): Promise<void> {
logger.info('Scanning profiles for running gateways...')
const profiles = await this.listProfiles()
for (const name of profiles) {
const status = await this.detectStatus(name)
if (status.running) {
logger.info('%s: running (PID: %s, port: %d)', name, status.pid, status.port)
} else {
logger.debug('%s: stopped', name)
}
}
}
/**
* 启动所有未运行的网关
*
* 两阶段执行:
* Phase 1 — 顺序处理:检查状态、清理旧进程、分配端口
* Phase 2 — 并行启动网关进程
*/
async startAll(): Promise<void> {
// 清空已分配端口集合,确保每次启动都从干净状态开始
this.allocatedPorts.clear()
const profiles = await this.listProfiles()
// Phase 1: 顺序处理
const toStart: Array<{ name: string; endpoint: ResolvedGatewayEndpoint }> = []
for (const name of profiles) {
const existing = this.gateways.get(name)
if (existing && this.isProcessAlive(existing.pid)) {
if (await this.checkHealth(existing.url, 1000)) {
logger.info('%s: already running (PID: %d, port: %d)', name, existing.pid, existing.port)
continue
}
logger.info('%s: process alive but unhealthy (PID: %d, port: %d), restarting',
name, existing.pid, existing.port)
try {
await this.stop(name)
} catch (err) {
logger.debug('Failed to stop unhealthy gateway: %s', err)
}
}
// Skip remote profiles — local hermes command cannot start remote gateways
const { host } = this.readProfilePort(name)
if (host && host !== '127.0.0.1' && host !== 'localhost') {
logger.info('%s: remote profile (host=%s), skipping auto-start', name, host)
continue
}
// 有 PID 文件但进程未在正确端口运行 → 通过 health check 检查网关状态
const pid = this.readPidFile(name)
if (pid && this.isProcessAlive(pid)) {
const { port: configuredPort, host } = this.readProfilePort(name)
const configuredUrl = buildHttpUrl(host, configuredPort)
// 检查配置文件中的端口是否有正常的网关在运行
if (await this.checkHealth(configuredUrl, 2000)) {
// Health check 通过,说明网关正常工作
logger.info('%s: gateway already running on configured port %d (PID: %d, health check passed)',
name, configuredPort, pid)
// 注册到内存中
this.gateways.set(name, { pid, port: configuredPort, host, url: configuredUrl, owned: false })
continue
} else {
// Health check 失败,说明网关有问题(僵尸进程或端口冲突)
logger.info('%s: stale process (PID: %d) health check failed on port %d, stopping and restarting',
name, pid, configuredPort)
try {
await this.stop(name)
} catch (err) {
logger.debug('Failed to stop stale gateway: %s', err)
}
// 清理过期的 PID 文件
this.clearPidFile(name)
}
}
// 只为真正需要启动的网关分配端口
const endpoint = await this.resolvePort(name)
toStart.push({ name, endpoint })
}
// Phase 2: 并行启动
// 串行启动网关,避免并发时的lock file竞争条件
for (const { name, endpoint } of toStart) {
try {
await this.startResolved(name, endpoint)
} catch (err: any) {
logger.error(err, '%s: failed to start', name)
}
}
}
}