add bridge performance monitoring

This commit is contained in:
ekko
2026-05-23 09:05:03 +08:00
committed by ekko
parent 4223014e0c
commit c184519c5d
21 changed files with 1778 additions and 91 deletions
@@ -0,0 +1,63 @@
import { request } from '../client'
export interface ProcessUsage {
pid: number
role: 'web' | 'broker' | 'worker'
profile?: string
running: boolean
cpuPercent: number
memoryRssBytes: number
command?: string
error?: string
}
export interface PerformanceRuntimeSnapshot {
timestamp: number
system: {
platform: string
arch: string
uptimeSeconds: number
cpuCount: number
cpuPercent: number
loadAverage: number[]
totalMemoryBytes: number
freeMemoryBytes: number
usedMemoryBytes: number
memoryPercent: number
}
web: {
pid: number
uptimeSeconds: number
memory: Record<string, number>
cpuPercent: number
}
bridge: {
endpoint: string
reachable: boolean
error?: string
broker: {
running: boolean
ready: boolean
pid?: number
process?: ProcessUsage
restartScheduled: boolean
restartAttempts: number
}
workers: Array<ProcessUsage & {
endpoint?: string
lastUsedAt?: number
sessionCount: number
runningSessionCount: number
}>
totalWorkerMemoryRssBytes: number
}
sessions: {
active: number
running: number
byProfile: Record<string, number>
}
}
export async function fetchPerformanceRuntime(): Promise<PerformanceRuntimeSnapshot> {
return request<PerformanceRuntimeSnapshot>('/api/hermes/performance/runtime')
}
@@ -226,10 +226,17 @@ function openChangelog() {
</svg>
<span>{{ t("sidebar.usage") }}</span>
</button>
<button class="nav-item" :class="{ active: selectedKey === 'hermes.skillsUsage' }" @click="handleNav('hermes.skillsUsage')">
<button class="nav-item" :class="{ active: selectedKey === 'hermes.performance' }" @click="handleNav('hermes.performance')">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round">
<polyline points="22 12 18 12 15 21 9 3 6 12 2 12" />
</svg>
<span>{{ t("sidebar.performance") }}</span>
</button>
<button class="nav-item" :class="{ active: selectedKey === 'hermes.skillsUsage' }" @click="handleNav('hermes.skillsUsage')">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round">
<path d="M21.21 15.89A10 10 0 1 1 8.11 2.79" />
<path d="M22 12A10 10 0 0 0 12 2v10z" />
</svg>
<span>{{ t("sidebar.skillsUsage") }}</span>
</button>
</div>
+31
View File
@@ -82,6 +82,7 @@ export default {
memory: 'Gedachtnis',
logs: 'Protokolle',
usage: 'Nutzung',
performance: 'Leistung',
skillsUsage: 'Skill-Nutzung',
channels: 'Kanale',
terminal: 'Konsole',
@@ -116,6 +117,36 @@ export default {
collapse: 'Menü einklappen',
},
performance: {
title: 'Leistung',
subtitle: 'Systemressourcen, Bridge Broker, Workers und aktive Sitzungen überwachen',
refresh: 'Aktualisieren',
autoRefreshOn: 'Automatisch aktualisieren',
autoRefreshOff: 'Manuell aktualisieren',
loadFailed: 'Leistungsdaten konnten nicht geladen werden',
systemCpu: 'System-CPU',
systemMemory: 'Systemspeicher',
activeSessions: 'Aktive Sitzungen',
runningSessions: 'Laufend {count}',
workers: 'Workers',
totalWorkerMemory: 'Worker-Gesamtspeicher',
processes: 'Prozesse',
uptime: 'Laufzeit',
running: 'Läuft',
stopped: 'Gestoppt',
workerMemory: 'Worker-Speicher',
lastUpdated: 'Aktualisiert',
profile: 'Profile',
memory: 'Speicher',
sessions: 'Sitzungen',
runningActiveSessions: 'Laufend / Aktiv',
lastUsed: 'Zuletzt verwendet',
status: 'Status',
noWorkers: 'Keine Workers',
sessionsByProfile: 'Sitzungen nach Profile',
noActiveSessions: 'Keine aktiven Sitzungen',
},
// Drawer
drawer: {
terminal: 'Konsole',
+31
View File
@@ -83,6 +83,7 @@ export default {
memory: 'Memory',
logs: 'Logs',
usage: 'Usage',
performance: 'Performance',
skillsUsage: 'Skills Usage',
channels: 'Channels',
gateways: 'Gateways',
@@ -116,6 +117,36 @@ export default {
noChangelog: 'No changelog available',
},
performance: {
title: 'Performance',
subtitle: 'Inspect system resources, bridge broker, workers, and active sessions',
refresh: 'Refresh',
autoRefreshOn: 'Auto refresh',
autoRefreshOff: 'Manual refresh',
loadFailed: 'Failed to load performance metrics',
systemCpu: 'System CPU',
systemMemory: 'System Memory',
activeSessions: 'Active Sessions',
runningSessions: 'Running {count}',
workers: 'Workers',
totalWorkerMemory: 'Worker memory',
processes: 'Processes',
uptime: 'Uptime',
running: 'Running',
stopped: 'Stopped',
workerMemory: 'Worker Memory',
lastUpdated: 'Updated',
profile: 'Profile',
memory: 'Memory',
sessions: 'Sessions',
runningActiveSessions: 'Running / Active',
lastUsed: 'Last Used',
status: 'Status',
noWorkers: 'No workers',
sessionsByProfile: 'Sessions by Profile',
noActiveSessions: 'No active sessions',
},
// Drawer
drawer: {
terminal: 'Terminal',
+31
View File
@@ -82,6 +82,7 @@ export default {
memory: 'Memoria',
logs: 'Registros',
usage: 'Uso',
performance: 'Rendimiento',
skillsUsage: 'Uso de habilidades',
channels: 'Canales',
terminal: 'Terminal',
@@ -116,6 +117,36 @@ export default {
collapse: 'Contraer menú',
},
performance: {
title: 'Rendimiento',
subtitle: 'Supervisa recursos del sistema, Bridge Broker, Workers y sesiones activas',
refresh: 'Actualizar',
autoRefreshOn: 'Actualización automática',
autoRefreshOff: 'Actualización manual',
loadFailed: 'No se pudieron cargar las métricas de rendimiento',
systemCpu: 'CPU del sistema',
systemMemory: 'Memoria del sistema',
activeSessions: 'Sesiones activas',
runningSessions: 'En ejecución {count}',
workers: 'Workers',
totalWorkerMemory: 'Memoria total de Worker',
processes: 'Procesos',
uptime: 'Tiempo activo',
running: 'En ejecución',
stopped: 'Detenido',
workerMemory: 'Memoria de Worker',
lastUpdated: 'Actualizado',
profile: 'Profile',
memory: 'Memoria',
sessions: 'Sesiones',
runningActiveSessions: 'En ejecución / Activas',
lastUsed: 'Último uso',
status: 'Estado',
noWorkers: 'Sin Workers',
sessionsByProfile: 'Sesiones por Profile',
noActiveSessions: 'No hay sesiones activas',
},
// Drawer
drawer: {
terminal: 'Terminal',
+31
View File
@@ -82,6 +82,7 @@ export default {
memory: 'Memoire',
logs: 'Journaux',
usage: 'Utilisation',
performance: 'Performance',
skillsUsage: 'Utilisation des compétences',
channels: 'Canaux',
terminal: 'Terminal',
@@ -116,6 +117,36 @@ export default {
collapse: 'Replier le menu',
},
performance: {
title: 'Performance',
subtitle: 'Surveiller les ressources système, Bridge Broker, Workers et sessions actives',
refresh: 'Actualiser',
autoRefreshOn: 'Actualisation auto',
autoRefreshOff: 'Actualisation manuelle',
loadFailed: 'Échec du chargement des métriques de performance',
systemCpu: 'CPU système',
systemMemory: 'Mémoire système',
activeSessions: 'Sessions actives',
runningSessions: 'En cours {count}',
workers: 'Workers',
totalWorkerMemory: 'Mémoire totale Worker',
processes: 'Processus',
uptime: 'Disponibilité',
running: 'En cours',
stopped: 'Arrêté',
workerMemory: 'Mémoire Worker',
lastUpdated: 'Mis à jour',
profile: 'Profile',
memory: 'Mémoire',
sessions: 'Sessions',
runningActiveSessions: 'En cours / Actives',
lastUsed: 'Dernière utilisation',
status: 'Statut',
noWorkers: 'Aucun Worker',
sessionsByProfile: 'Sessions par Profile',
noActiveSessions: 'Aucune session active',
},
// Drawer
drawer: {
terminal: 'Terminal',
+31
View File
@@ -82,6 +82,7 @@ export default {
memory: 'メモリ',
logs: 'ログ',
usage: '使用量',
performance: 'パフォーマンス',
skillsUsage: 'スキル使用状況',
channels: 'チャンネル',
terminal: 'ターミナル',
@@ -116,6 +117,36 @@ export default {
collapse: 'メニューを折りたたむ',
},
performance: {
title: 'パフォーマンス',
subtitle: 'システムリソース、Bridge Broker、Workers、アクティブセッションを確認',
refresh: '更新',
autoRefreshOn: '自動更新',
autoRefreshOff: '手動更新',
loadFailed: 'パフォーマンスデータの読み込みに失敗しました',
systemCpu: 'システム CPU',
systemMemory: 'システムメモリ',
activeSessions: 'アクティブセッション',
runningSessions: '実行中 {count}',
workers: 'Workers',
totalWorkerMemory: 'Worker 合計メモリ',
processes: 'プロセス',
uptime: '稼働時間',
running: '実行中',
stopped: '停止',
workerMemory: 'Worker メモリ',
lastUpdated: '更新時刻',
profile: 'Profile',
memory: 'メモリ',
sessions: 'セッション',
runningActiveSessions: '実行中 / アクティブ',
lastUsed: '最終使用',
status: '状態',
noWorkers: 'Worker はありません',
sessionsByProfile: 'Profile 別セッション',
noActiveSessions: 'アクティブセッションはありません',
},
// ドロワー
drawer: {
terminal: 'ターミナル',
+31
View File
@@ -82,6 +82,7 @@ export default {
memory: '메모리',
logs: '로그',
usage: '사용량',
performance: '성능 모니터링',
skillsUsage: '스킬 사용량',
channels: '채널',
terminal: '터미널',
@@ -116,6 +117,36 @@ export default {
collapse: '메뉴 접기',
},
performance: {
title: '성능 모니터링',
subtitle: '시스템 리소스, Bridge Broker, Workers, 활성 세션 확인',
refresh: '새로고침',
autoRefreshOn: '자동 새로고침',
autoRefreshOff: '수동 새로고침',
loadFailed: '성능 데이터를 불러오지 못했습니다',
systemCpu: '시스템 CPU',
systemMemory: '시스템 메모리',
activeSessions: '활성 세션',
runningSessions: '실행 중 {count}',
workers: 'Workers',
totalWorkerMemory: 'Worker 총 메모리',
processes: '프로세스',
uptime: '실행 시간',
running: '실행 중',
stopped: '중지됨',
workerMemory: 'Worker 메모리',
lastUpdated: '업데이트 시간',
profile: 'Profile',
memory: '메모리',
sessions: '세션',
runningActiveSessions: '실행 중 / 활성',
lastUsed: '마지막 사용',
status: '상태',
noWorkers: 'Worker 없음',
sessionsByProfile: 'Profile별 세션',
noActiveSessions: '활성 세션 없음',
},
// 서랍
drawer: {
terminal: '터미널',
+31
View File
@@ -82,6 +82,7 @@ export default {
memory: 'Memoria',
logs: 'Logs',
usage: 'Uso',
performance: 'Desempenho',
skillsUsage: 'Uso de habilidades',
channels: 'Canais',
terminal: 'Terminal',
@@ -116,6 +117,36 @@ export default {
collapse: 'Recolher menu',
},
performance: {
title: 'Desempenho',
subtitle: 'Monitore recursos do sistema, Bridge Broker, Workers e sessões ativas',
refresh: 'Atualizar',
autoRefreshOn: 'Atualização automática',
autoRefreshOff: 'Atualização manual',
loadFailed: 'Falha ao carregar métricas de desempenho',
systemCpu: 'CPU do sistema',
systemMemory: 'Memória do sistema',
activeSessions: 'Sessões ativas',
runningSessions: 'Em execução {count}',
workers: 'Workers',
totalWorkerMemory: 'Memória total de Worker',
processes: 'Processos',
uptime: 'Tempo ativo',
running: 'Em execução',
stopped: 'Parado',
workerMemory: 'Memória de Worker',
lastUpdated: 'Atualizado',
profile: 'Profile',
memory: 'Memória',
sessions: 'Sessões',
runningActiveSessions: 'Em execução / Ativas',
lastUsed: 'Último uso',
status: 'Status',
noWorkers: 'Nenhum Worker',
sessionsByProfile: 'Sessões por Profile',
noActiveSessions: 'Nenhuma sessão ativa',
},
// Gaveta
drawer: {
terminal: 'Terminal',
+31
View File
@@ -83,6 +83,7 @@ export default {
memory: '記憶',
logs: '日誌',
usage: '用量',
performance: '效能監控',
skillsUsage: '技能用量',
channels: '頻道',
gateways: '閘道',
@@ -116,6 +117,36 @@ export default {
noChangelog: '目前無更新日誌',
},
performance: {
title: '效能監控',
subtitle: '查看系統資源、Bridge Broker、Workers 和活躍會話',
refresh: '重新整理',
autoRefreshOn: '自動重新整理',
autoRefreshOff: '手動重新整理',
loadFailed: '效能資料載入失敗',
systemCpu: '系統 CPU',
systemMemory: '系統記憶體',
activeSessions: '活躍會話',
runningSessions: '執行中 {count}',
workers: 'Workers',
totalWorkerMemory: 'Worker 總記憶體',
processes: '程序',
uptime: '執行',
running: '執行中',
stopped: '已停止',
workerMemory: 'Worker 記憶體',
lastUpdated: '更新時間',
profile: 'Profile',
memory: '記憶體',
sessions: '會話',
runningActiveSessions: '執行中 / 活躍',
lastUsed: '最後使用',
status: '狀態',
noWorkers: '暫無 Worker',
sessionsByProfile: '按 Profile 統計會話',
noActiveSessions: '暫無活躍會話',
},
// 抽屜
drawer: {
terminal: '終端機',
+31
View File
@@ -83,6 +83,7 @@ export default {
memory: '记忆',
logs: '日志',
usage: '用量',
performance: '性能监控',
skillsUsage: '技能用量',
channels: '频道',
gateways: '网关',
@@ -116,6 +117,36 @@ export default {
noChangelog: '暂无更新日志',
},
performance: {
title: '性能监控',
subtitle: '查看系统资源、Bridge Broker、Workers 和活跃会话',
refresh: '刷新',
autoRefreshOn: '自动刷新',
autoRefreshOff: '手动刷新',
loadFailed: '性能数据加载失败',
systemCpu: '系统 CPU',
systemMemory: '系统内存',
activeSessions: '活跃会话',
runningSessions: '运行中 {count}',
workers: 'Workers',
totalWorkerMemory: 'Worker 总内存',
processes: '进程',
uptime: '运行',
running: '运行中',
stopped: '已停止',
workerMemory: 'Worker 内存',
lastUpdated: '更新时间',
profile: 'Profile',
memory: '内存',
sessions: '会话',
runningActiveSessions: '运行中 / 活跃',
lastUsed: '最后使用',
status: '状态',
noWorkers: '暂无 Worker',
sessionsByProfile: '按 Profile 统计会话',
noActiveSessions: '暂无活跃会话',
},
// 抽屉
drawer: {
terminal: '终端',
+5
View File
@@ -50,6 +50,11 @@ const router = createRouter({
name: 'hermes.usage',
component: () => import('@/views/hermes/UsageView.vue'),
},
{
path: '/hermes/performance',
name: 'hermes.performance',
component: () => import('@/views/hermes/PerformanceView.vue'),
},
{
path: '/hermes/skills-usage',
name: 'hermes.skillsUsage',
@@ -0,0 +1,486 @@
<script setup lang="ts">
import { computed, onBeforeUnmount, onMounted, ref } from 'vue'
import { NButton, NSpin, useMessage } from 'naive-ui'
import { useI18n } from 'vue-i18n'
import { fetchPerformanceRuntime, type PerformanceRuntimeSnapshot } from '@/api/hermes/performance-monitor'
const { t } = useI18n()
const message = useMessage()
const snapshot = ref<PerformanceRuntimeSnapshot | null>(null)
const loading = ref(false)
const autoRefresh = ref(true)
let timer: ReturnType<typeof setInterval> | undefined
const brokerMemory = computed(() => snapshot.value?.bridge.broker.process?.memoryRssBytes ?? null)
const webRssMemory = computed(() => snapshot.value?.web.memory.rss ?? null)
const workerCount = computed(() => snapshot.value?.bridge.workers.length ?? 0)
const runningWorkerCount = computed(() => snapshot.value?.bridge.workers.filter(worker => worker.running).length ?? 0)
function formatBytes(value?: number | null): string {
if (value == null || !Number.isFinite(value)) return '-'
const units = ['B', 'KB', 'MB', 'GB', 'TB']
let size = value
let unit = 0
while (size >= 1024 && unit < units.length - 1) {
size /= 1024
unit += 1
}
return `${size.toFixed(unit === 0 ? 0 : 1)} ${units[unit]}`
}
function formatPercent(value?: number | null): string {
return value == null || !Number.isFinite(value) ? '-' : `${value.toFixed(1)}%`
}
function formatDuration(seconds?: number | null): string {
if (seconds == null || !Number.isFinite(seconds)) return '-'
const days = Math.floor(seconds / 86400)
const hours = Math.floor((seconds % 86400) / 3600)
const minutes = Math.floor((seconds % 3600) / 60)
if (days > 0) return `${days}d ${hours}h`
if (hours > 0) return `${hours}h ${minutes}m`
return `${minutes}m`
}
function formatTime(seconds?: number): string {
if (!seconds) return '-'
return new Date(seconds * 1000).toLocaleString()
}
function statusText(running: boolean): string {
return running ? t('performance.running') : t('performance.stopped')
}
async function loadRuntime(showError = true) {
loading.value = true
try {
snapshot.value = await fetchPerformanceRuntime()
} catch (err: any) {
if (showError) message.error(err?.message || t('performance.loadFailed'))
} finally {
loading.value = false
}
}
function setAutoRefresh(enabled: boolean) {
autoRefresh.value = enabled
if (timer) {
clearInterval(timer)
timer = undefined
}
if (enabled) {
timer = setInterval(() => loadRuntime(false), 3000)
}
}
onMounted(() => {
loadRuntime()
setAutoRefresh(true)
})
onBeforeUnmount(() => {
if (timer) clearInterval(timer)
})
</script>
<template>
<div class="performance-view">
<header class="page-header">
<h2 class="header-title">{{ t('performance.title') }}</h2>
<div class="header-actions">
<NButton size="small" :type="autoRefresh ? 'primary' : 'default'" secondary @click="setAutoRefresh(!autoRefresh)">
{{ autoRefresh ? t('performance.autoRefreshOn') : t('performance.autoRefreshOff') }}
</NButton>
<NButton size="small" :loading="loading" @click="loadRuntime()">{{ t('performance.refresh') }}</NButton>
</div>
</header>
<NSpin :show="loading && !snapshot" class="performance-spin">
<main v-if="snapshot" class="performance-content">
<section class="summary-grid">
<div class="summary-item">
<span class="summary-label">{{ t('performance.systemCpu') }}</span>
<strong>{{ formatPercent(snapshot.system.cpuPercent) }}</strong>
<div class="meter"><span :style="{ width: `${snapshot.system.cpuPercent || 0}%` }" /></div>
</div>
<div class="summary-item">
<span class="summary-label">{{ t('performance.systemMemory') }}</span>
<strong>{{ formatPercent(snapshot.system.memoryPercent) }}</strong>
<small>{{ formatBytes(snapshot.system.usedMemoryBytes) }} / {{ formatBytes(snapshot.system.totalMemoryBytes) }}</small>
<div class="meter"><span :style="{ width: `${snapshot.system.memoryPercent || 0}%` }" /></div>
</div>
<div class="summary-item">
<span class="summary-label">{{ t('performance.activeSessions') }}</span>
<strong>{{ snapshot.sessions.active }}</strong>
<small>{{ t('performance.runningSessions', { count: snapshot.sessions.running }) }}</small>
</div>
<div class="summary-item">
<span class="summary-label">{{ t('performance.workers') }}</span>
<strong>{{ runningWorkerCount }} / {{ workerCount }}</strong>
<small>{{ t('performance.totalWorkerMemory') }} {{ formatBytes(snapshot.bridge.totalWorkerMemoryRssBytes) }}</small>
</div>
</section>
<section class="runtime-section">
<div class="section-header">
<h3>{{ t('performance.processes') }}</h3>
<span>{{ snapshot.system.platform }} {{ snapshot.system.arch }} · {{ snapshot.system.cpuCount }} CPU · {{ t('performance.uptime') }} {{ formatDuration(snapshot.system.uptimeSeconds) }}</span>
</div>
<div class="process-grid">
<div class="process-row">
<div>
<strong>Web UI</strong>
<span>PID {{ snapshot.web.pid }}</span>
</div>
<span>{{ formatPercent(snapshot.web.cpuPercent) }}</span>
<span>{{ formatBytes(webRssMemory) }}</span>
<span class="status running">{{ statusText(true) }}</span>
</div>
<div class="process-row">
<div>
<strong>Bridge Broker</strong>
<span>{{ snapshot.bridge.endpoint }}</span>
</div>
<span>{{ formatPercent(snapshot.bridge.broker.process?.cpuPercent) }}</span>
<span>{{ formatBytes(brokerMemory) }}</span>
<span class="status" :class="{ running: snapshot.bridge.reachable && snapshot.bridge.broker.running }">
{{ snapshot.bridge.reachable && snapshot.bridge.broker.running ? statusText(true) : statusText(false) }}
</span>
</div>
</div>
<div v-if="snapshot.bridge.error" class="runtime-error">{{ snapshot.bridge.error }}</div>
</section>
<section class="runtime-section">
<div class="section-header">
<h3>{{ t('performance.workerMemory') }}</h3>
<span>{{ t('performance.lastUpdated') }} {{ new Date(snapshot.timestamp).toLocaleTimeString() }}</span>
</div>
<div class="worker-table-wrap">
<table class="worker-table">
<thead>
<tr>
<th>{{ t('performance.profile') }}</th>
<th>PID</th>
<th>CPU</th>
<th>{{ t('performance.memory') }}</th>
<th>{{ t('performance.runningActiveSessions') }}</th>
<th>{{ t('performance.lastUsed') }}</th>
<th>{{ t('performance.status') }}</th>
</tr>
</thead>
<tbody>
<tr v-if="snapshot.bridge.workers.length === 0">
<td colspan="7" class="empty-cell">{{ t('performance.noWorkers') }}</td>
</tr>
<tr v-for="worker in snapshot.bridge.workers" :key="worker.profile || worker.pid">
<td>{{ worker.profile || '-' }}</td>
<td>{{ worker.pid || '-' }}</td>
<td>{{ formatPercent(worker.cpuPercent) }}</td>
<td>{{ formatBytes(worker.memoryRssBytes) }}</td>
<td>{{ worker.runningSessionCount }} / {{ worker.sessionCount }}</td>
<td>{{ formatTime(worker.lastUsedAt) }}</td>
<td><span class="status" :class="{ running: worker.running }">{{ statusText(worker.running) }}</span></td>
</tr>
</tbody>
</table>
</div>
</section>
<section class="runtime-section">
<div class="section-header">
<h3>{{ t('performance.sessionsByProfile') }}</h3>
</div>
<div class="session-list">
<div v-if="Object.keys(snapshot.sessions.byProfile).length === 0" class="session-empty">
{{ t('performance.noActiveSessions') }}
</div>
<div v-for="(count, profile) in snapshot.sessions.byProfile" :key="profile" class="session-row">
<span>{{ profile }}</span>
<strong>{{ count }}</strong>
</div>
</div>
</section>
</main>
</NSpin>
</div>
</template>
<style scoped lang="scss">
@use '@/styles/variables' as *;
.performance-view {
height: 100%;
display: flex;
flex-direction: column;
}
.page-header {
display: flex;
flex-shrink: 0;
align-items: center;
justify-content: space-between;
gap: 12px;
padding: 21px 20px;
border-bottom: 1px solid $border-color;
}
.header-title {
margin: 0;
color: $text-primary;
font-size: 16px;
font-weight: 600;
}
.header-actions {
display: flex;
align-items: center;
gap: 8px;
}
.performance-spin {
flex: 1;
min-height: 0;
}
.performance-content {
height: 100%;
overflow-y: auto;
padding: 20px;
}
.summary-grid {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 12px;
margin-bottom: 16px;
}
.summary-item,
.runtime-section {
border: 1px solid $border-color;
border-radius: $radius-sm;
background: $bg-card;
}
.summary-item {
min-height: 108px;
padding: 14px;
display: flex;
flex-direction: column;
gap: 8px;
}
.summary-label,
.summary-item small,
.section-header span,
.process-row div span {
color: $text-muted;
font-size: 12px;
}
.summary-item strong {
color: $text-primary;
font-size: 24px;
font-weight: 650;
}
.meter {
height: 6px;
overflow: hidden;
border-radius: 999px;
background: $bg-secondary;
span {
display: block;
height: 100%;
border-radius: inherit;
background: $accent-primary;
}
}
.runtime-section {
margin-top: 12px;
overflow: hidden;
}
.section-header {
min-height: 46px;
padding: 12px 14px;
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
border-bottom: 1px solid $border-light;
h3 {
margin: 0;
color: $text-primary;
font-size: 14px;
font-weight: 600;
}
}
.process-grid {
display: grid;
grid-template-columns: 1fr;
}
.process-row {
min-height: 56px;
padding: 10px 14px;
display: grid;
grid-template-columns: minmax(0, 1fr) 80px 110px 86px;
align-items: center;
gap: 12px;
border-bottom: 1px solid $border-light;
color: $text-secondary;
font-size: 13px;
&:last-child {
border-bottom: 0;
}
div {
min-width: 0;
display: flex;
flex-direction: column;
gap: 3px;
}
strong {
color: $text-primary;
font-size: 13px;
}
span {
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
}
.status {
width: fit-content;
max-width: 100%;
padding: 2px 8px;
border: 1px solid $border-color;
border-radius: 999px;
color: $text-muted;
font-size: 12px;
&.running {
border-color: rgba(var(--success-rgb), 0.35);
color: $success;
background: rgba(var(--success-rgb), 0.08);
}
}
.runtime-error {
padding: 10px 14px;
border-top: 1px solid $border-light;
color: $error;
font-size: 12px;
}
.worker-table-wrap {
overflow-x: auto;
}
.worker-table {
width: 100%;
min-width: 760px;
border-collapse: collapse;
color: $text-secondary;
font-size: 13px;
th,
td {
padding: 11px 14px;
border-bottom: 1px solid $border-light;
text-align: left;
white-space: nowrap;
}
th {
color: $text-muted;
font-size: 12px;
font-weight: 600;
}
td:first-child {
color: $text-primary;
font-weight: 600;
}
tr:last-child td {
border-bottom: 0;
}
}
.empty-cell,
.session-empty {
color: $text-muted;
text-align: center;
}
.session-list {
padding: 6px 14px;
}
.session-row {
min-height: 34px;
display: flex;
align-items: center;
justify-content: space-between;
gap: 12px;
border-bottom: 1px solid $border-light;
color: $text-secondary;
font-size: 13px;
&:last-child {
border-bottom: 0;
}
strong {
color: $text-primary;
}
}
.session-empty {
padding: 18px 0;
font-size: 13px;
}
@media (max-width: 960px) {
.summary-grid {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
}
@media (max-width: $breakpoint-mobile) {
.page-header,
.header-actions,
.section-header {
align-items: flex-start;
flex-direction: column;
}
.header-actions {
width: 100%;
}
.summary-grid {
grid-template-columns: 1fr;
}
.process-row {
grid-template-columns: 1fr 72px;
> span:nth-child(3),
> span:nth-child(4) {
justify-self: start;
}
}
}
</style>
@@ -0,0 +1,9 @@
import { createEmptyOpsRuntimeSnapshot, getOpsRuntimeSnapshot } from '../../services/hermes/ops-monitor'
export async function runtime(ctx: any) {
try {
ctx.body = await getOpsRuntimeSnapshot()
} catch (err: any) {
ctx.body = createEmptyOpsRuntimeSnapshot(err?.message || 'Failed to read performance metrics')
}
}
@@ -0,0 +1,6 @@
import Router from '@koa/router'
import * as ctrl from '../../controllers/hermes/performance-monitor'
export const performanceMonitorRoutes = new Router()
performanceMonitorRoutes.get('/api/hermes/performance/runtime', ctrl.runtime)
+2
View File
@@ -31,6 +31,7 @@ import { ttsRoutes } from './hermes/tts'
import { mediaRoutes } from './hermes/media'
import { proxyRoutes, proxyMiddleware } from './hermes/proxy'
import { groupChatRoutes, setGroupChatServer } from './hermes/group-chat'
import { performanceMonitorRoutes } from './hermes/performance-monitor'
/**
* Register all routes on the Koa app.
@@ -72,6 +73,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
app.use(cronHistoryRoutes.routes()) // Must be before proxy
app.use(kanbanRoutes.routes()) // Must be before proxy
app.use(mediaRoutes.routes()) // Must be before proxy
app.use(performanceMonitorRoutes.routes()) // Must be before proxy
app.use(proxyRoutes.routes())
// Proxy catch-all middleware (must be last)
@@ -10,13 +10,16 @@ delimited JSON request/response protocol over a local socket.
from __future__ import annotations
import argparse
import atexit
import copy
import errno
import hashlib
import importlib.util
import json
import locale
import os
import queue
import signal
import shutil
import socket
import subprocess
@@ -38,12 +41,100 @@ DEFAULT_AGENT_ROOT = "~/.hermes/hermes-agent"
DEFAULT_HERMES_HOME = "~/.hermes"
APPROVAL_TIMEOUT_SECONDS = 120
APPROVAL_TIMEOUT_MS = APPROVAL_TIMEOUT_SECONDS * 1000
PARENT_WATCHDOG_INTERVAL_SECONDS = 2.0
def _bridge_platform() -> str:
return os.environ.get("HERMES_AGENT_BRIDGE_PLATFORM", "cli").strip() or "cli"
def _positive_int(value: str | None) -> int | None:
if not value:
return None
try:
parsed = int(value)
except (TypeError, ValueError):
return None
return parsed if parsed > 0 else None
def _process_exists(pid: int) -> bool:
if pid <= 0:
return False
if os.name == "nt":
try:
result = subprocess.run(
["tasklist.exe", "/FI", f"PID eq {pid}", "/NH"],
check=False,
capture_output=True,
text=True,
timeout=5,
)
return str(pid) in (result.stdout or "")
except Exception:
return True
try:
os.kill(pid, 0)
return True
except ProcessLookupError:
return False
except PermissionError:
return True
except OSError as exc:
return exc.errno != errno.ESRCH
def _start_parent_process_watchdog(
parent_pid: int | None,
stop_event: threading.Event,
label: str,
interval: float = PARENT_WATCHDOG_INTERVAL_SECONDS,
) -> None:
if not parent_pid or parent_pid == os.getpid():
return
def run() -> None:
while not stop_event.wait(interval):
if _process_exists(parent_pid):
continue
print(
f"[hermes-bridge] parent pid {parent_pid} exited; stopping {label}",
file=sys.stderr,
flush=True,
)
stop_event.set()
return
threading.Thread(target=run, daemon=True, name=f"hermes-bridge-parent-watchdog-{label}").start()
def _install_stop_signal_handlers(stop_event: threading.Event) -> Callable[[], None]:
if threading.current_thread() is not threading.main_thread():
return lambda: None
previous: list[tuple[signal.Signals, Any]] = []
def handle_signal(signum: int, _frame: Any) -> None:
stop_event.set()
for signum in (signal.SIGINT, signal.SIGTERM):
try:
sig = signal.Signals(signum)
previous.append((sig, signal.getsignal(sig)))
signal.signal(sig, handle_signal)
except Exception:
pass
def restore() -> None:
for sig, handler in previous:
try:
signal.signal(sig, handler)
except Exception:
pass
return restore
def _suppress_bridge_platform_hint() -> None:
raw = os.environ.get("HERMES_BRIDGE_SUPPRESS_PLATFORM_HINT", "cli").strip()
if raw.lower() in {"0", "false", "no", "off"}:
@@ -1452,12 +1543,18 @@ class BridgeServer:
raise ValueError("action is required")
if action == "ping":
with self.pool._lock:
sessions = list(self.pool._sessions.values())
running_sessions = sum(1 for session in sessions if session.running)
return {
"pong": True,
"time": time.time(),
"pid": os.getpid(),
"agent_root": str(_agent_root()),
"profile": _worker_profile() or "default",
"hermes_home": str(_hermes_home()),
"session_count": len(sessions),
"running_session_count": running_sessions,
}
if action == "chat":
@@ -1588,46 +1685,54 @@ class BridgeServer:
def serve_forever(self) -> None:
server = self._make_server_socket()
server.listen(16)
server.settimeout(0.2)
print(json.dumps({"event": "ready", "endpoint": self.endpoint}), flush=True)
restore_signals = _install_stop_signal_handlers(self._stop)
_start_parent_process_watchdog(
_positive_int(os.environ.get("HERMES_AGENT_BRIDGE_BROKER_PID")),
self._stop,
f"worker:{_worker_profile() or 'default'}",
)
try:
server.listen(16)
server.settimeout(0.2)
print(json.dumps({"event": "ready", "endpoint": self.endpoint}), flush=True)
while not self._stop.is_set():
conn: socket.socket | None = None
try:
while not self._stop.is_set():
conn: socket.socket | None = None
try:
conn, _addr = server.accept()
except socket.timeout:
self._gc_idle_sessions()
continue
try:
req = self._read_request(conn)
data = self.handle(req)
resp = {"ok": True, **_jsonable(data)}
except Exception as exc:
resp = {
"ok": False,
"error": str(exc),
"error_type": exc.__class__.__name__,
}
self._write_response(conn, resp)
except KeyboardInterrupt:
break
except Exception as exc:
print(f"[hermes-bridge] server loop error: {exc}", file=sys.stderr, flush=True)
finally:
if conn is not None:
try:
conn.close()
except OSError:
pass
server.close()
if self.endpoint.startswith("ipc://"):
try:
Path(self.endpoint.removeprefix("ipc://")).unlink(missing_ok=True)
except OSError:
pass
conn, _addr = server.accept()
except socket.timeout:
self._gc_idle_sessions()
continue
try:
req = self._read_request(conn)
data = self.handle(req)
resp = {"ok": True, **_jsonable(data)}
except Exception as exc:
resp = {
"ok": False,
"error": str(exc),
"error_type": exc.__class__.__name__,
}
self._write_response(conn, resp)
except KeyboardInterrupt:
break
except Exception as exc:
print(f"[hermes-bridge] server loop error: {exc}", file=sys.stderr, flush=True)
finally:
if conn is not None:
try:
conn.close()
except OSError:
pass
finally:
restore_signals()
server.close()
if self.endpoint.startswith("ipc://"):
try:
Path(self.endpoint.removeprefix("ipc://")).unlink(missing_ok=True)
except OSError:
pass
class WorkerProcess:
@@ -1647,6 +1752,10 @@ class WorkerProcess:
def running(self) -> bool:
return self.process is not None and self.process.poll() is None
@property
def pid(self) -> int | None:
return self.process.pid if self.process is not None else None
def start(self) -> None:
with self._lock:
if self.running:
@@ -1668,6 +1777,7 @@ class WorkerProcess:
**os.environ,
"HERMES_AGENT_BRIDGE_ENDPOINT": self.endpoint,
"HERMES_AGENT_BRIDGE_WORKER_PROFILE": self.profile,
"HERMES_AGENT_BRIDGE_BROKER_PID": str(os.getpid()),
}
self.process = subprocess.Popen(
args,
@@ -2019,6 +2129,18 @@ class BridgeBroker:
if event.get("event") in {"bridge.compression.completed", "bridge.compression.failed"} and request_id:
self._compression_profile.pop(request_id, None)
def stop(self) -> None:
self._stop.set()
with self._lock:
workers = list(self._workers.values())
self._workers.clear()
self._run_profile.clear()
self._session_profile.clear()
self._approval_profile.clear()
self._compression_profile.clear()
for worker in workers:
worker.stop()
def _forward(self, profile: str, req: dict[str, Any]) -> dict[str, Any]:
worker = self._worker_for_profile(profile)
forwarded = dict(req)
@@ -2034,8 +2156,33 @@ class BridgeBroker:
if action == "ping":
with self._lock:
workers = {profile: worker.running for profile, worker in self._workers.items()}
return {"pong": True, "time": time.time(), "mode": "broker", "workers": workers}
worker_details = {
profile: {
"running": worker.running,
"pid": worker.pid,
"endpoint": worker.endpoint,
"last_used_at": worker.last_used_at,
}
for profile, worker in self._workers.items()
}
workers = {profile: details["running"] for profile, details in worker_details.items()}
sessions_by_profile: dict[str, int] = {}
for profile in self._session_profile.values():
sessions_by_profile[profile] = sessions_by_profile.get(profile, 0) + 1
active_sessions = len(self._session_profile)
return {
"pong": True,
"time": time.time(),
"mode": "broker",
"broker": {
"pid": os.getpid(),
"endpoint": self.endpoint,
},
"workers": workers,
"worker_details": worker_details,
"active_sessions": active_sessions,
"sessions_by_profile": sessions_by_profile,
}
if action == "worker_ping":
profile = self._normalize_profile(req.get("profile"))
@@ -2145,17 +2292,7 @@ class BridgeBroker:
return {"sessions": sessions}
if action == "shutdown":
self._stop.set()
with self._lock:
workers = list(self._workers.values())
for worker in workers:
if not worker.running:
worker.stop()
continue
try:
worker.request({"action": "shutdown"})
except Exception:
worker.stop()
self.stop()
return {"status": "shutting_down"}
raise ValueError(f"unknown action: {action}")
@@ -2187,51 +2324,55 @@ class BridgeBroker:
def serve_forever(self) -> None:
server = self._make_server_socket()
server.listen(64)
server.settimeout(0.2)
print(json.dumps({"event": "ready", "endpoint": self.endpoint, "mode": "broker"}), flush=True)
restore_signals = _install_stop_signal_handlers(self._stop)
atexit.register(self.stop)
try:
server.listen(64)
server.settimeout(0.2)
print(json.dumps({"event": "ready", "endpoint": self.endpoint, "mode": "broker"}), flush=True)
while not self._stop.is_set():
conn: socket.socket | None = None
try:
while not self._stop.is_set():
conn: socket.socket | None = None
try:
conn, _addr = server.accept()
except socket.timeout:
self._gc_idle_workers()
continue
try:
req = self._read_request(conn)
data = self.handle(req)
resp = {"ok": True, **_jsonable(data)}
except Exception as exc:
resp = {
"ok": False,
"error": str(exc),
"error_type": exc.__class__.__name__,
}
self._write_response(conn, resp)
except KeyboardInterrupt:
break
except Exception as exc:
print(f"[hermes-bridge-broker] server loop error: {exc}", file=sys.stderr, flush=True)
finally:
if conn is not None:
try:
conn.close()
except OSError:
pass
with self._lock:
workers = list(self._workers.values())
self._workers.clear()
for worker in workers:
worker.stop()
server.close()
if self.endpoint.startswith("ipc://"):
conn, _addr = server.accept()
except socket.timeout:
self._gc_idle_workers()
continue
try:
req = self._read_request(conn)
data = self.handle(req)
resp = {"ok": True, **_jsonable(data)}
except Exception as exc:
resp = {
"ok": False,
"error": str(exc),
"error_type": exc.__class__.__name__,
}
self._write_response(conn, resp)
except KeyboardInterrupt:
break
except Exception as exc:
print(f"[hermes-bridge-broker] server loop error: {exc}", file=sys.stderr, flush=True)
finally:
if conn is not None:
try:
conn.close()
except OSError:
pass
finally:
restore_signals()
try:
Path(self.endpoint.removeprefix("ipc://")).unlink(missing_ok=True)
except OSError:
atexit.unregister(self.stop)
except Exception:
pass
self.stop()
server.close()
if self.endpoint.startswith("ipc://"):
try:
Path(self.endpoint.removeprefix("ipc://")).unlink(missing_ok=True)
except OSError:
pass
def main(argv: list[str] | None = None) -> int:
@@ -25,6 +25,17 @@ export interface BridgeCommand {
hermesHome: string
}
export interface AgentBridgeManagerRuntimeState {
endpoint: string
running: boolean
ready: boolean
pid?: number
starting: boolean
stopping: boolean
restartScheduled: boolean
restartAttempts: number
}
function envPositiveInt(name: string): number | undefined {
const raw = process.env[name]
if (!raw) return undefined
@@ -308,6 +319,19 @@ export class AgentBridgeManager {
return !!this.child && !this.child.killed && this.ready
}
getRuntimeState(): AgentBridgeManagerRuntimeState {
return {
endpoint: this.endpoint,
running: this.running,
ready: this.ready,
pid: this.child?.pid,
starting: !!this.starting,
stopping: this.stopping,
restartScheduled: !!this.restartTimer,
restartAttempts: this.restartAttempts,
}
}
async start(): Promise<void> {
if (this.running) return
if (this.starting) return this.starting
@@ -0,0 +1,551 @@
import { execFileSync } from 'child_process'
import { readFileSync } from 'fs'
import { cpus, freemem, loadavg, platform, totalmem, uptime } from 'os'
import { AgentBridgeClient } from './agent-bridge'
import { getAgentBridgeManager } from './agent-bridge/manager'
export interface ProcessUsage {
pid: number
role: 'web' | 'broker' | 'worker'
profile?: string
running: boolean
cpuPercent: number
memoryRssBytes: number
command?: string
error?: string
}
export interface OpsRuntimeSnapshot {
timestamp: number
system: {
platform: NodeJS.Platform
arch: string
uptimeSeconds: number
cpuCount: number
cpuPercent: number
loadAverage: number[]
totalMemoryBytes: number
freeMemoryBytes: number
usedMemoryBytes: number
memoryPercent: number
}
web: {
pid: number
uptimeSeconds: number
memory: NodeJS.MemoryUsage
cpuPercent: number
}
bridge: {
endpoint: string
reachable: boolean
error?: string
broker: {
running: boolean
ready: boolean
pid?: number
process?: ProcessUsage
restartScheduled: boolean
restartAttempts: number
}
workers: Array<ProcessUsage & {
endpoint?: string
lastUsedAt?: number
sessionCount: number
runningSessionCount: number
}>
totalWorkerMemoryRssBytes: number
}
sessions: {
active: number
running: number
byProfile: Record<string, number>
}
}
interface CpuTimesSample {
idle: number
total: number
}
interface WebCpuSample {
at: number
usage: NodeJS.CpuUsage
}
interface SystemMemoryUsage {
totalMemoryBytes: number
freeMemoryBytes: number
usedMemoryBytes: number
memoryPercent: number
}
let previousSystemCpu: CpuTimesSample | null = null
let previousWebCpu: WebCpuSample | null = null
function safeCpus(): ReturnType<typeof cpus> {
try {
return cpus()
} catch {
return []
}
}
function safeLoadAverage(): number[] {
try {
return loadavg()
} catch {
return [0, 0, 0]
}
}
function safeUptime(): number {
try {
return uptime()
} catch {
return 0
}
}
function safeProcessUptime(): number {
try {
return process.uptime()
} catch {
return 0
}
}
function safeProcessMemoryUsage(): NodeJS.MemoryUsage {
try {
return process.memoryUsage()
} catch {
return {
rss: 0,
heapTotal: 0,
heapUsed: 0,
external: 0,
arrayBuffers: 0,
}
}
}
function readCpuTimes(): CpuTimesSample {
let idle = 0
let total = 0
for (const cpu of safeCpus()) {
idle += cpu.times.idle
total += Object.values(cpu.times).reduce((sum, value) => sum + value, 0)
}
return { idle, total }
}
function sampleSystemCpuPercent(): number | null {
try {
const current = readCpuTimes()
const previous = previousSystemCpu
previousSystemCpu = current
if (!previous) return null
const idleDelta = current.idle - previous.idle
const totalDelta = current.total - previous.total
if (totalDelta <= 0) return null
return clampPercent(((totalDelta - idleDelta) / totalDelta) * 100)
} catch {
return null
}
}
function sampleWebCpuPercent(): number | null {
try {
const current = {
at: Date.now(),
usage: process.cpuUsage(),
}
const previous = previousWebCpu
previousWebCpu = current
if (!previous) return null
const elapsedMicros = (current.at - previous.at) * 1000
const used = (current.usage.user - previous.usage.user) + (current.usage.system - previous.usage.system)
if (elapsedMicros <= 0 || used < 0) return null
return clampPercent((used / elapsedMicros / Math.max(safeCpus().length, 1)) * 100)
} catch {
return null
}
}
function clampPercent(value: number): number {
return Math.max(0, Math.min(100, Math.round(value * 10) / 10))
}
function numberOrNull(value: unknown): number | null {
const parsed = Number(value)
return Number.isFinite(parsed) ? parsed : null
}
function fallbackSystemMemoryUsage(): SystemMemoryUsage {
let memoryTotal = 0
let memoryFree = 0
try {
memoryTotal = totalmem()
memoryFree = freemem()
} catch {}
const usedMemory = memoryTotal - memoryFree
return {
totalMemoryBytes: memoryTotal,
freeMemoryBytes: memoryFree,
usedMemoryBytes: usedMemory,
memoryPercent: memoryTotal > 0 ? clampPercent((usedMemory / memoryTotal) * 100) : 0,
}
}
function parseVmStatPageCount(line: string): number | null {
const match = line.match(/:\s+([\d.]+)\.?$/)
if (!match) return null
const value = Number(match[1].replace(/\./g, ''))
return Number.isFinite(value) ? value : null
}
export function parseMacVmStatMemory(vmStatOutput: string, totalMemoryBytes: number): SystemMemoryUsage | null {
const pageSize = Number(vmStatOutput.match(/page size of\s+(\d+)\s+bytes/i)?.[1])
if (!Number.isFinite(pageSize) || pageSize <= 0 || totalMemoryBytes <= 0) return null
const pages: Record<string, number> = {}
for (const line of vmStatOutput.split(/\r?\n/)) {
const count = parseVmStatPageCount(line.trim())
if (count == null) continue
if (line.includes('Pages active')) pages.active = count
else if (line.includes('Pages wired down')) pages.wired = count
else if (line.includes('Pages occupied by compressor')) pages.compressed = count
}
const usedPages = (pages.active || 0) + (pages.wired || 0) + (pages.compressed || 0)
if (usedPages <= 0) return null
const usedMemory = Math.min(totalMemoryBytes, usedPages * pageSize)
const freeMemory = Math.max(0, totalMemoryBytes - usedMemory)
return {
totalMemoryBytes,
freeMemoryBytes: freeMemory,
usedMemoryBytes: usedMemory,
memoryPercent: clampPercent((usedMemory / totalMemoryBytes) * 100),
}
}
function collectMacSystemMemoryUsage(): SystemMemoryUsage | null {
try {
const totalRaw = execFileSync('sysctl', ['-n', 'hw.memsize'], {
encoding: 'utf-8',
timeout: 3000,
}).trim()
const totalMemoryBytes = Number(totalRaw)
const vmStatOutput = execFileSync('vm_stat', {
encoding: 'utf-8',
timeout: 3000,
})
return parseMacVmStatMemory(vmStatOutput, totalMemoryBytes)
} catch {
return null
}
}
function collectSystemMemoryUsage(): SystemMemoryUsage {
if (platform() === 'darwin') {
return collectMacSystemMemoryUsage() || fallbackSystemMemoryUsage()
}
return fallbackSystemMemoryUsage()
}
function collectPosixProcessMetrics(pids: number[]): Map<number, Partial<ProcessUsage>> {
const metrics = collectProcfsProcessMetrics(pids)
if (!pids.length) return metrics
try {
const output = execFileSync('ps', ['-o', 'pid=,pcpu=,rss=,comm=', '-p', pids.join(',')], {
encoding: 'utf-8',
timeout: 3000,
})
for (const line of output.split(/\r?\n/)) {
const trimmed = line.trim()
if (!trimmed) continue
const [pidRaw, cpuRaw, rssRaw, ...commandParts] = trimmed.split(/\s+/)
const pid = Number(pidRaw)
if (!Number.isFinite(pid)) continue
const rssKb = numberOrNull(rssRaw)
metrics.set(pid, {
cpuPercent: numberOrNull(cpuRaw) ?? 0,
memoryRssBytes: rssKb == null ? metrics.get(pid)?.memoryRssBytes : rssKb * 1024,
command: commandParts.join(' ') || undefined,
})
}
return metrics
} catch {
return metrics
}
}
function collectProcfsProcessMetrics(pids: number[]): Map<number, Partial<ProcessUsage>> {
const metrics = new Map<number, Partial<ProcessUsage>>()
for (const pid of pids) {
try {
const status = readFileSync(`/proc/${pid}/status`, 'utf-8')
const rssKb = Number(status.match(/^VmRSS:\s+(\d+)\s+kB/im)?.[1])
const name = status.match(/^Name:\s+(.+)$/im)?.[1]?.trim()
metrics.set(pid, {
cpuPercent: 0,
memoryRssBytes: Number.isFinite(rssKb) ? rssKb * 1024 : 0,
command: name,
})
} catch {}
}
return metrics
}
function parseWindowsJson(output: string): any[] {
if (!output.trim()) return []
const parsed = JSON.parse(output)
return Array.isArray(parsed) ? parsed : [parsed]
}
function collectWindowsProcessMetrics(pids: number[]): Map<number, Partial<ProcessUsage>> {
if (!pids.length) return new Map()
const idList = pids.join(',')
try {
const script = [
`$ids=@(${idList})`,
'Get-CimInstance Win32_PerfFormattedData_PerfProc_Process',
'| Where-Object { $ids -contains [int]$_.IDProcess }',
'| Select-Object @{Name="pid";Expression={[int]$_.IDProcess}},@{Name="cpuPercent";Expression={[double]$_.PercentProcessorTime}},@{Name="memoryRssBytes";Expression={[double]$_.WorkingSet}},@{Name="command";Expression={$_.Name}}',
'| ConvertTo-Json -Compress',
].join(' ')
const output = execFileSync('powershell.exe', ['-NoProfile', '-Command', script], {
encoding: 'utf-8',
timeout: 5000,
windowsHide: true,
})
const metrics = new Map<number, Partial<ProcessUsage>>()
for (const item of parseWindowsJson(output)) {
const pid = Number(item?.pid)
if (!Number.isFinite(pid)) continue
metrics.set(pid, {
cpuPercent: numberOrNull(item?.cpuPercent) ?? 0,
memoryRssBytes: numberOrNull(item?.memoryRssBytes) ?? 0,
command: typeof item?.command === 'string' ? item.command : undefined,
})
}
return metrics
} catch {}
const metrics = new Map<number, Partial<ProcessUsage>>()
for (const pid of pids) {
try {
const output = execFileSync('tasklist.exe', ['/FI', `PID eq ${pid}`, '/FO', 'CSV', '/NH'], {
encoding: 'utf-8',
timeout: 3000,
windowsHide: true,
})
const line = output.split(/\r?\n/).find(item => item.includes(`"${pid}"`))
if (!line) continue
const columns = line.match(/(".*?"|[^",]+)(?=\s*,|\s*$)/g)?.map(value => value.replace(/^"|"$/g, '')) || []
const memoryKb = Number(columns[4]?.replace(/[^\d]/g, ''))
metrics.set(pid, {
cpuPercent: 0,
memoryRssBytes: Number.isFinite(memoryKb) ? memoryKb * 1024 : 0,
command: columns[0],
})
} catch {}
}
return metrics
}
function collectProcessMetrics(pids: number[]): Map<number, Partial<ProcessUsage>> {
const uniquePids = [...new Set(pids.filter(pid => Number.isFinite(pid) && pid > 0))]
return platform() === 'win32'
? collectWindowsProcessMetrics(uniquePids)
: collectPosixProcessMetrics(uniquePids)
}
function processUsage(
pid: number | undefined,
role: ProcessUsage['role'],
metrics: Map<number, Partial<ProcessUsage>>,
profile?: string,
): ProcessUsage | undefined {
if (!pid) return undefined
const metric = metrics.get(pid)
return {
pid,
role,
profile,
running: !!metric,
cpuPercent: metric?.cpuPercent ?? 0,
memoryRssBytes: metric?.memoryRssBytes ?? 0,
command: metric?.command,
}
}
function normalizeWorker(raw: unknown): {
running: boolean
pid?: number
endpoint?: string
lastUsedAt?: number
} {
if (typeof raw === 'boolean') return { running: raw }
if (!raw || typeof raw !== 'object') return { running: false }
const record = raw as Record<string, unknown>
const pid = Number(record.pid)
const lastUsedAt = Number(record.last_used_at)
return {
running: !!record.running,
pid: Number.isFinite(pid) && pid > 0 ? pid : undefined,
endpoint: typeof record.endpoint === 'string' ? record.endpoint : undefined,
lastUsedAt: Number.isFinite(lastUsedAt) ? lastUsedAt : undefined,
}
}
export function createEmptyOpsRuntimeSnapshot(error?: string): OpsRuntimeSnapshot {
return {
timestamp: Date.now(),
system: {
platform: process.platform,
arch: process.arch,
uptimeSeconds: safeUptime(),
cpuCount: safeCpus().length,
cpuPercent: 0,
loadAverage: safeLoadAverage(),
totalMemoryBytes: 0,
freeMemoryBytes: 0,
usedMemoryBytes: 0,
memoryPercent: 0,
},
web: {
pid: process.pid,
uptimeSeconds: safeProcessUptime(),
memory: safeProcessMemoryUsage(),
cpuPercent: 0,
},
bridge: {
endpoint: '',
reachable: false,
error,
broker: {
running: false,
ready: false,
restartScheduled: false,
restartAttempts: 0,
},
workers: [],
totalWorkerMemoryRssBytes: 0,
},
sessions: {
active: 0,
running: 0,
byProfile: {},
},
}
}
export async function getOpsRuntimeSnapshot(): Promise<OpsRuntimeSnapshot> {
const manager = getAgentBridgeManager()
const managerState = manager.getRuntimeState()
let bridgeReachable = false
let bridgeError: string | undefined
let bridgePing: Record<string, any> = {}
let sessions: Array<Record<string, any>> = []
try {
const client = new AgentBridgeClient({ endpoint: managerState.endpoint, timeoutMs: 2000, connectRetryMs: 0 })
bridgePing = await client.ping() as Record<string, any>
bridgeReachable = true
try {
const list = await client.list()
sessions = Array.isArray((list as any).sessions) ? (list as any).sessions : []
} catch {}
} catch (err: any) {
bridgeError = err?.message || 'Agent bridge is not reachable'
}
const workerEntries = Object.entries((bridgePing.worker_details || {}) as Record<string, unknown>)
.map(([profile, value]) => [profile, normalizeWorker(value)] as const)
const brokerPid = Number(bridgePing.broker?.pid || managerState.pid)
const pids = [
process.pid,
Number.isFinite(brokerPid) ? brokerPid : undefined,
...workerEntries.map(([, worker]) => worker.pid),
].filter((pid): pid is number => typeof pid === 'number' && pid > 0)
const processMetrics = collectProcessMetrics(pids)
const sessionCountsByProfile: Record<string, number> = {}
let runningSessions = 0
for (const session of sessions) {
const profileName = String(session.profile || 'default')
sessionCountsByProfile[profileName] = (sessionCountsByProfile[profileName] || 0) + 1
if (session.running) runningSessions += 1
}
if (!sessions.length && bridgePing.sessions_by_profile && typeof bridgePing.sessions_by_profile === 'object') {
for (const [profileName, count] of Object.entries(bridgePing.sessions_by_profile)) {
const value = Number(count)
if (Number.isFinite(value)) sessionCountsByProfile[profileName] = value
}
}
const workers = workerEntries.map(([profileName, worker]) => {
const usage = processUsage(worker.pid, 'worker', processMetrics, profileName)
return {
pid: worker.pid || 0,
role: 'worker' as const,
profile: profileName,
running: worker.running,
cpuPercent: usage?.cpuPercent ?? 0,
memoryRssBytes: usage?.memoryRssBytes ?? 0,
command: usage?.command,
endpoint: worker.endpoint,
lastUsedAt: worker.lastUsedAt,
sessionCount: sessionCountsByProfile[profileName] || 0,
runningSessionCount: sessions.filter(session => String(session.profile || 'default') === profileName && session.running).length,
}
})
const systemMemory = collectSystemMemoryUsage()
const totalWorkerMemory = workers.reduce((sum, worker) => sum + (worker.memoryRssBytes || 0), 0)
return {
timestamp: Date.now(),
system: {
platform: process.platform,
arch: process.arch,
uptimeSeconds: safeUptime(),
cpuCount: safeCpus().length,
cpuPercent: sampleSystemCpuPercent() ?? 0,
loadAverage: safeLoadAverage(),
totalMemoryBytes: systemMemory.totalMemoryBytes,
freeMemoryBytes: systemMemory.freeMemoryBytes,
usedMemoryBytes: systemMemory.usedMemoryBytes,
memoryPercent: systemMemory.memoryPercent,
},
web: {
pid: process.pid,
uptimeSeconds: safeProcessUptime(),
memory: safeProcessMemoryUsage(),
cpuPercent: sampleWebCpuPercent() ?? 0,
},
bridge: {
endpoint: managerState.endpoint,
reachable: bridgeReachable,
error: bridgeError,
broker: {
running: managerState.running,
ready: managerState.ready,
pid: Number.isFinite(brokerPid) && brokerPid > 0 ? brokerPid : undefined,
process: processUsage(Number.isFinite(brokerPid) ? brokerPid : undefined, 'broker', processMetrics),
restartScheduled: managerState.restartScheduled,
restartAttempts: managerState.restartAttempts,
},
workers,
totalWorkerMemoryRssBytes: totalWorkerMemory,
},
sessions: {
active: sessions.length || Number(bridgePing.active_sessions || 0),
running: runningSessions,
byProfile: sessionCountsByProfile,
},
}
}
@@ -393,6 +393,80 @@ assert calls == []
pool._run_context.session_id = "session-a"
assert pool._approval_dispatcher("cmd", "desc", allow_permanent=False) == "once"
assert calls == [("cmd", "desc", False)]
`)
})
it('cleans broker workers and wires worker parent watchdog state', () => {
runPython(String.raw`
${harness}
class FakeWorker:
def __init__(self):
self.running = True
self.stopped = False
def stop(self):
self.running = False
self.stopped = True
broker = bridge.BridgeBroker("ipc:///tmp/unused.sock")
worker = FakeWorker()
broker._workers["default"] = worker
broker._run_profile["run-a"] = "default"
broker._session_profile["session-a"] = "default"
broker._approval_profile["approval-a"] = "default"
broker._compression_profile["compression-a"] = "default"
broker.stop()
assert broker._stop.is_set()
assert worker.stopped
assert broker._workers == {}
assert broker._run_profile == {}
assert broker._session_profile == {}
assert broker._approval_profile == {}
assert broker._compression_profile == {}
created = {}
class FakeProcess:
stdout = None
stderr = None
def poll(self):
return None
def fake_popen(args, **kwargs):
created["args"] = args
created["env"] = kwargs["env"]
return FakeProcess()
original_popen = bridge.subprocess.Popen
original_getpid = bridge.os.getpid
try:
bridge.subprocess.Popen = fake_popen
bridge.os.getpid = lambda: 4242
proc_worker = bridge.WorkerProcess("default", "ipc:///tmp/worker.sock", "/agent", "/home")
proc_worker._pipe_stderr = lambda: None
proc_worker._wait_ready = lambda: None
proc_worker.start()
finally:
bridge.subprocess.Popen = original_popen
bridge.os.getpid = original_getpid
assert created["env"]["HERMES_AGENT_BRIDGE_BROKER_PID"] == "4242"
assert created["env"]["HERMES_AGENT_BRIDGE_WORKER_PROFILE"] == "default"
stop_event = threading.Event()
seen_pids = []
original_process_exists = bridge._process_exists
try:
bridge._process_exists = lambda pid: seen_pids.append(pid) and False
bridge._start_parent_process_watchdog(12345, stop_event, "test", interval=0.01)
assert wait_for(stop_event.is_set, timeout=2)
finally:
bridge._process_exists = original_process_exists
assert seen_pids == [12345]
`)
})
})
@@ -0,0 +1,40 @@
import { afterEach, describe, expect, it, vi } from 'vitest'
const getOpsRuntimeSnapshot = vi.fn()
vi.mock('../../packages/server/src/services/hermes/ops-monitor', () => ({
createEmptyOpsRuntimeSnapshot: (error?: string) => ({ timestamp: 0, error }),
getOpsRuntimeSnapshot,
}))
describe('performance monitor controller', () => {
afterEach(() => {
vi.clearAllMocks()
})
it('returns the runtime snapshot from the performance service', async () => {
const snapshot = {
timestamp: 1,
bridge: { workers: [] },
sessions: { active: 0 },
}
getOpsRuntimeSnapshot.mockResolvedValue(snapshot)
const ctx: any = {}
const { runtime } = await import('../../packages/server/src/controllers/hermes/performance-monitor')
await runtime(ctx)
expect(ctx.body).toBe(snapshot)
})
it('returns a zero snapshot when metrics collection fails', async () => {
getOpsRuntimeSnapshot.mockRejectedValue(new Error('boom'))
const ctx: any = {}
const { runtime } = await import('../../packages/server/src/controllers/hermes/performance-monitor')
await runtime(ctx)
expect(ctx.status).toBeUndefined()
expect(ctx.body).toEqual({ timestamp: 0, error: 'boom' })
})
})