From 44d1b13741618c068a35ca3cee74b81c78914207 Mon Sep 17 00:00:00 2001 From: ekko <152005280+EKKOLearnAI@users.noreply.github.com> Date: Tue, 12 May 2026 22:03:28 +0800 Subject: [PATCH] fix: enhance gateway logging for Windows dev restart debugging (#665) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive debug logging throughout the gateway lifecycle to help troubleshoot nodemon restart issues on Windows, where SIGTERM is used instead of SIGUSR2. Changes: - Enhanced shutdown handler to log all signals and env var states - Gateway manager now logs process detachment mode explicitly - Added environment variable confirmation on bootstrap - Updated gateway-development.md with new debug logs and troubleshooting steps Benefits: - Easier troubleshooting of gateway lifecycle issues - Clear visibility into signal handling during nodemon restarts - Better cross-platform development experience - Production behavior remains unchanged Testing: - ✅ Windows: Gateways persist across nodemon restarts - ✅ macOS/Linux: Existing SIGUSR2 behavior preserved - ✅ Production: Default shutdown cleanup unchanged - ✅ Backward compatibility: No breaking changes Co-authored-by: Claude Sonnet 4.6 --- docs/gateway-development.md | 131 +++++++++++++++++- packages/server/src/index.ts | 4 + .../src/services/hermes/gateway-manager.ts | 12 +- packages/server/src/services/shutdown.ts | 24 +++- 4 files changed, 161 insertions(+), 10 deletions(-) diff --git a/docs/gateway-development.md b/docs/gateway-development.md index 06ac4b3..bf80a4e 100644 --- a/docs/gateway-development.md +++ b/docs/gateway-development.md @@ -138,7 +138,7 @@ This keeps each profile isolated. ## Development Mode on Windows -Windows development has one important difference: `nodemon` restarts can terminate child processes as part of the process tree. 
+Windows development has one important difference: `nodemon` restarts can terminate child processes as part of the process tree. On Windows, `nodemon` may send `SIGTERM` during restarts instead of `SIGUSR2`. To avoid closing every gateway on each server restart, `nodemon.json` sets: @@ -152,13 +152,28 @@ To avoid closing every gateway on each server restart, `nodemon.json` sets: When this variable is `0` or `false`: -- shutdown skips `gatewayManager.stopAll()`; +- shutdown skips `gatewayManager.stopAll()` for **all signals** (including `SIGTERM`); - gateway processes are spawned with `detached: true`; - gateway child processes are `unref()`ed; - the restarted server re-detects running gateways during `detectAllOnStartup()`. This is the intended local development behavior. Editing server files should restart the Web UI server without killing all Hermes gateways. +### Debug Logging + +The enhanced shutdown handler now logs all signals and environment variable states: + +```text +[shutdown] Signal: SIGTERM, HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN: 0 +[shutdown] Dev mode detected: NOT stopping gateways +``` + +Gateway startup logs also indicate the process detachment mode: + +```text +[gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0) +``` + ## Production Shutdown Behavior In production, the env override is normally unset. @@ -173,7 +188,15 @@ bindShutdown() Only gateways marked as `owned` by the current Web UI instance are stopped by `stopAll()`. -`SIGUSR2` is treated as a restart signal and skips gateway shutdown by default. This keeps compatibility with restart tools that use `SIGUSR2`. 
+### Signal Handling + +| Signal | Default Behavior | With `HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0` | +|--------|------------------|--------------------------------------------------| +| `SIGTERM` | Stop gateways | Skip gateway shutdown | +| `SIGINT` | Stop gateways | Skip gateway shutdown | +| `SIGUSR2` | Skip gateway shutdown (reload) | Skip gateway shutdown | + +**Windows Note**: `nodemon` on Windows typically sends `SIGTERM` during restarts, not `SIGUSR2`. This is why the `HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0` override is critical on Windows for development. ## Stop Flow @@ -249,10 +272,46 @@ Expected behavior: - gateways keep running across server restarts; - the restarted server re-registers healthy gateways during bootstrap. +### Quick Health Check + +Verify everything is working: + +```bash +# Check environment variable is set +# (should see: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN = 0) +npm run dev + +# In another terminal, check gateways are running +ps aux | grep -i "hermes.*gateway" + +# Trigger a restart by editing a server file +# (gateways should keep running) +``` + +### Expected Logs + +**Startup:** +```text +[bootstrap] HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN = 0 +[gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0) +``` + +**During Nodemon Restart:** +```text +[shutdown] Signal: SIGTERM, HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN: 0 +[shutdown] Dev mode detected: NOT stopping gateways +``` + +**After Restart:** +```text +[bootstrap] HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN = 0 +%s: already running (PID: xxxxx, port: 8642) +``` + If a gateway fails after restart, check: 1. `HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN` is `0` in the server process. -2. Gateway start logs include `detached: true`. +2. Gateway start logs include `Detaching gateway process`. 3. The profile has a valid `gateway.pid` or `gateway_state.json`. 4. The configured gateway `/health` endpoint is reachable. 5. 
No unrelated process occupies the profile's configured port. @@ -270,10 +329,40 @@ HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0 Also confirm the gateway start log prints: ```text -detached: true +[gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0) ``` -If it prints `detached: false`, the dev opt-out env did not reach the server process. +If it prints `Attaching gateway process`, the dev opt-out env did not reach the server process. + +#### Debugging Steps + +1. **Check startup logs** for environment variable confirmation: + ```text + [bootstrap] HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN = 0 + ``` + +2. **Check shutdown logs** when nodemon restarts: + ```text + [shutdown] Signal: SIGTERM, HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN: 0 + [shutdown] Dev mode detected: NOT stopping gateways + ``` + +3. **Verify gateway detachment mode**: + ```text + [gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0) + ``` + +4. **Check if gateway survived restart**: + ```bash + # Before restart + ps aux | grep -i "hermes.*gateway" + # Note the PID + # After nodemon restart + ps aux | grep -i "hermes.*gateway" + # PID should be the same + ``` + +If logs show `Attaching gateway process` or shutdown logs show `STOPPING gateways`, the environment variable is not being applied correctly. ### Gateway is alive but Web UI does not detect it @@ -324,3 +413,33 @@ If startup still fails, inspect the profile directory for: - Treat port listener discovery as a fallback. A listening port can belong to another process. - Preserve production shutdown cleanup unless the dev opt-out env is explicitly set. - When changing Windows process handling, test both `npm run dev` and production-style startup. 
+ +## Recent Changes + +### Enhanced Logging and Windows Support (2026-05-12) + +**Improvements:** +- Enhanced shutdown handler with detailed logging for all signals +- Gateway manager now logs detachment mode explicitly +- Added environment variable confirmation on startup +- Improved cross-platform signal handling documentation + +**Debug Logs Added:** +```text +[bootstrap] HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN = 0 +[gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=0) +[shutdown] Signal: SIGTERM, HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN: 0 +[shutdown] Dev mode detected: NOT stopping gateways +``` + +**Benefits:** +- Easier troubleshooting of gateway lifecycle issues +- Clear visibility into signal handling during nodemon restarts +- Better cross-platform development experience +- Production behavior remains unchanged + +**Testing:** +- ✅ Windows: Gateways persist across nodemon restarts +- ✅ macOS/Linux: Existing SIGUSR2 behavior preserved +- ✅ Production: Default shutdown cleanup unchanged +- ✅ Backward compatibility: No breaking changes diff --git a/packages/server/src/index.ts b/packages/server/src/index.ts index 018daca..c3add37 100644 --- a/packages/server/src/index.ts +++ b/packages/server/src/index.ts @@ -85,6 +85,10 @@ export async function bootstrap() { const authToken = await getToken() await initLoginLimiter() + + // Debug: log environment variable + console.log('[bootstrap] HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN =', process.env.HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN) + const app = new Koa() await initGatewayManager() diff --git a/packages/server/src/services/hermes/gateway-manager.ts b/packages/server/src/services/hermes/gateway-manager.ts index db8d47d..18005fd 100644 --- a/packages/server/src/services/hermes/gateway-manager.ts +++ b/packages/server/src/services/hermes/gateway-manager.ts @@ -148,8 +148,18 @@ function isLocalHost(host: string): boolean { } function shouldDetachGatewayProcess(): boolean { + // In
dev mode (nodemon), always detach gateway processes so they survive restarts + // Production mode: attach gateways so they can be managed together with the server const override = process.env.HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN?.trim().toLowerCase() - return override === '0' || override === 'false' + const shouldDetach = override === '0' || override === 'false' + + if (shouldDetach) { + console.log('[gateway] Detaching gateway process (dev mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=' + override + ')') + } else { + console.log('[gateway] Attaching gateway process (prod mode: HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN=' + (override || 'not set') + ')') + } + + return shouldDetach } // ============================ diff --git a/packages/server/src/services/shutdown.ts b/packages/server/src/services/shutdown.ts index f923b08..b0b0488 100644 --- a/packages/server/src/services/shutdown.ts +++ b/packages/server/src/services/shutdown.ts @@ -6,10 +6,25 @@ function shouldStopGatewaysOnShutdown(signal: string): boolean { // nodemon may use SIGTERM on Windows restarts, so dev mode opts out via env. // Production keeps stopping owned gateways by default. 
const override = process.env.HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN?.trim() - if (override === '0' || override === 'false') return false - if (override === '1' || override === 'true') return true - return signal !== 'SIGUSR2' + console.log(`[shutdown] Signal: ${signal}, HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN: ${override}`) + + // Explicit '0' or 'false' means dev mode: never stop gateways + if (override === '0' || override === 'false') { + console.log('[shutdown] Dev mode detected: NOT stopping gateways') + return false + } + + // Explicit '1' or 'true' means always stop gateways + if (override === '1' || override === 'true') { + console.log('[shutdown] Explicit gateway shutdown enabled: stopping gateways') + return true + } + + // Default behavior: only stop gateways on explicit termination, not on reload + const shouldStop = signal !== 'SIGUSR2' + console.log(`[shutdown] Default behavior: ${shouldStop ? 'STOPPING' : 'NOT stopping'} gateways (signal: ${signal})`) + return shouldStop } export function bindShutdown(server: any, groupChatServer?: any, chatRunServer?: any): void { @@ -23,6 +38,9 @@ export function bindShutdown(server: any, groupChatServer?: any, chatRunServer?: setTimeout(() => process.exit(0), 3000) logger.info('Shutting down (%s)...', signal) + console.log(`[shutdown] Received signal: ${signal}`) + console.log(`[shutdown] HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN = ${process.env.HERMES_WEB_UI_STOP_GATEWAYS_ON_SHUTDOWN}`) + console.log(`[shutdown] shouldStopGatewaysOnShutdown = ${shouldStopGatewaysOnShutdown(signal)}`) try { if (shouldStopGatewaysOnShutdown(signal)) {