Fix bridge history, profile models, and Windows gateway handling (#845)

* feat: support profile-aware group chat bridge flows

* feat: route cron jobs through hermes cli

* Fix group chat routing and isolate bridge tests

* Add Grok image-to-video media skill

* Default Grok videos to media directory

* Fix bridge profile fallback and cron repeat clearing

* Refine bridge chat and gateway platform handling

* Filter bridge tool-call text deltas

* Preserve structured bridge chat history

* Prepare beta release build artifacts

* Fix Windows run profile resolution

* Fix Windows path compatibility checks

* Fix profile-scoped model page display

* Hide Windows subprocess windows for jobs and updates

* Hide Windows file backend subprocess windows

* Avoid Windows gateway restart lock conflicts

* Treat Windows gateway lock as running on startup

* Force release Windows gateway lock on restart

* Tighten Windows gateway lock cleanup

* Update chat e2e source expectation

* Bump package version to 0.5.30

---------

Co-authored-by: Codex <codex@openai.com>
This commit is contained in:
ekko
2026-05-19 16:09:59 +08:00
committed by GitHub
parent 3d74d78698
commit 9a9416c99c
129 changed files with 7017 additions and 1838 deletions
@@ -11,13 +11,16 @@ from __future__ import annotations
import argparse
import copy
import hashlib
import importlib.util
import json
import os
import queue
import shutil
import socket
import subprocess
import sys
import tempfile
import threading
import time
import traceback
@@ -174,6 +177,11 @@ def _base_hermes_home() -> Path:
return _normalize_base_home(_discover_hermes_home(os.environ.get("HERMES_AGENT_BRIDGE_BASE_HOME") or DEFAULT_HERMES_HOME))
def _worker_profile() -> str | None:
raw = os.environ.get("HERMES_AGENT_BRIDGE_WORKER_PROFILE", "").strip()
return raw or None
def _profile_home(profile: str | None) -> Path:
base = _base_hermes_home()
if not profile or profile == "default":
@@ -319,8 +327,20 @@ def _restore_profile_dotenv(snapshot: dict[str, str | None]) -> None:
os.environ[key] = value
def _set_worker_profile_env(profile: str | None) -> None:
profile_home = _profile_home(profile)
os.environ["HERMES_HOME"] = str(profile_home)
os.environ["HERMES_AGENT_BRIDGE_WORKER_PROFILE"] = profile or "default"
values = _read_dotenv(profile_home / ".env")
for key, value in values.items():
os.environ[key] = value
@contextmanager
def _profile_env(profile: str | None):
if _worker_profile():
yield
return
original = _apply_profile_env(profile)
env_snapshot = _apply_profile_dotenv(profile)
try:
@@ -832,6 +852,7 @@ class AgentPool:
storage_message: Any | None,
conversation_history: list[dict[str, Any]] | None,
profile: str | None,
source: str | None = None,
) -> bool:
persist_message = storage_message if storage_message is not None else message
user_content = str(persist_message) if not isinstance(persist_message, dict) else str(persist_message.get("content", persist_message))
@@ -848,7 +869,7 @@ class AgentPool:
if hasattr(db, "create_session"):
db.create_session(
session_id=session.session_id,
source=_bridge_platform(),
source=source or _bridge_platform(),
model=session.config.get("model"),
)
@@ -958,6 +979,7 @@ class AgentPool:
force_compress: bool = False,
model: str | None = None,
provider: str | None = None,
source: str | None = None,
) -> RunRecord:
session = self.get_or_create(session_id, profile=profile, model=model, provider=provider)
with session.lock:
@@ -973,14 +995,14 @@ class AgentPool:
thread = threading.Thread(
target=self._run_chat,
args=(session, record, message, storage_message, instructions, conversation_history, profile, force_compress),
args=(session, record, message, storage_message, instructions, conversation_history, profile, force_compress, source),
daemon=True,
name=f"hermes-bridge-run-{run_id[:8]}",
)
thread.start()
return record
def _run_chat(self, session: AgentSession, record: RunRecord, message: Any, storage_message: Any | None = None, instructions: str | None = None, conversation_history: list[dict[str, Any]] | None = None, profile: str | None = None, force_compress: bool = False) -> None:
def _run_chat(self, session: AgentSession, record: RunRecord, message: Any, storage_message: Any | None = None, instructions: str | None = None, conversation_history: list[dict[str, Any]] | None = None, profile: str | None = None, force_compress: bool = False, source: str | None = None) -> None:
with self._run_lock:
with _profile_env(profile):
def stream_callback(delta: str) -> None:
@@ -1004,7 +1026,7 @@ class AgentPool:
os.environ["HERMES_EXEC_ASK"] = "1"
except Exception:
previous_approval_callback = None
self._prepersist_user_message(session, message, storage_message, conversation_history, profile)
self._prepersist_user_message(session, message, storage_message, conversation_history, profile, source)
db_count_after_prepersist = self._session_db_message_count(session.session_id, profile)
if force_compress:
compress = getattr(session.agent, "_compress_context", None)
@@ -1265,7 +1287,13 @@ class BridgeServer:
raise ValueError("action is required")
if action == "ping":
return {"pong": True, "time": time.time(), "agent_root": str(_agent_root())}
return {
"pong": True,
"time": time.time(),
"agent_root": str(_agent_root()),
"profile": _worker_profile() or "default",
"hermes_home": str(_hermes_home()),
}
if action == "chat":
session_id = str(req.get("session_id") or "").strip() or uuid.uuid4().hex
@@ -1276,6 +1304,7 @@ class BridgeServer:
profile = req.get("profile")
model = req.get("model")
provider = req.get("provider")
source = req.get("source")
record = self.pool.start_chat(
session_id,
message,
@@ -1286,6 +1315,7 @@ class BridgeServer:
bool(req.get("force_compress")),
model,
provider,
source,
)
if req.get("wait"):
timeout = float(req.get("timeout", 0) or 0)
@@ -1355,50 +1385,13 @@ class BridgeServer:
raise ValueError(f"unknown action: {action}")
def _make_server_socket(self) -> socket.socket:
if self.endpoint.startswith("ipc://"):
if not hasattr(socket, "AF_UNIX"):
raise RuntimeError("ipc:// endpoints require Unix domain socket support; use tcp://host:port on this platform")
sock_path = Path(self.endpoint.removeprefix("ipc://"))
sock_path.parent.mkdir(parents=True, exist_ok=True)
try:
sock_path.unlink(missing_ok=True)
except OSError:
pass
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
server.bind(str(sock_path))
return server
parsed = urlparse(self.endpoint)
if parsed.scheme != "tcp":
raise RuntimeError(f"unsupported endpoint scheme: {self.endpoint}")
host = parsed.hostname or "127.0.0.1"
port = int(parsed.port or 0)
if port <= 0:
raise RuntimeError(f"tcp endpoint requires a port: {self.endpoint}")
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind((host, port))
return server
return _make_listen_socket(self.endpoint)
def _read_request(self, conn: socket.socket) -> dict[str, Any]:
chunks: list[bytes] = []
while True:
chunk = conn.recv(65536)
if not chunk:
break
chunks.append(chunk)
if b"\n" in chunk:
break
if not chunks:
raise RuntimeError("empty request")
line = b"".join(chunks).split(b"\n", 1)[0].strip()
if not line:
raise RuntimeError("empty request")
return json.loads(line.decode("utf-8"))
return _read_json_request(conn)
def _write_response(self, conn: socket.socket, resp: dict[str, Any]) -> None:
payload = json.dumps(resp, ensure_ascii=False, default=str) + "\n"
conn.sendall(payload.encode("utf-8"))
_write_json_response(conn, resp)
def _gc_idle_sessions(self) -> None:
"""Destroy sessions idle longer than IDLE_TIMEOUT_SECONDS."""
@@ -1458,16 +1451,530 @@ class BridgeServer:
pass
class WorkerProcess:
STARTUP_TIMEOUT_SECONDS = 120
REQUEST_TIMEOUT_SECONDS = 120
def __init__(self, profile: str, endpoint: str, agent_root: str | None, hermes_home: str | None) -> None:
self.profile = profile or "default"
self.endpoint = endpoint
self.agent_root = agent_root
self.hermes_home = hermes_home
self.process: subprocess.Popen[str] | None = None
self.last_used_at = time.time()
self._lock = threading.RLock()
@property
def running(self) -> bool:
return self.process is not None and self.process.poll() is None
def start(self) -> None:
with self._lock:
if self.running:
return
args = [
sys.executable,
str(Path(__file__).resolve()),
"--endpoint",
self.endpoint,
"--worker-profile",
self.profile,
]
if self.agent_root:
args.extend(["--agent-root", self.agent_root])
if self.hermes_home:
args.extend(["--hermes-home", self.hermes_home])
env = {
**os.environ,
"HERMES_AGENT_BRIDGE_ENDPOINT": self.endpoint,
"HERMES_AGENT_BRIDGE_WORKER_PROFILE": self.profile,
}
self.process = subprocess.Popen(
args,
env=env,
cwd=os.getcwd(),
stdin=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
bufsize=1,
)
self._pipe_stderr()
self._wait_ready()
def _pipe_stderr(self) -> None:
proc = self.process
if proc is None or proc.stderr is None:
return
def run() -> None:
assert proc.stderr is not None
for line in proc.stderr:
text = line.rstrip()
if text:
print(f"[hermes-bridge-worker:{self.profile}] {text}", file=sys.stderr, flush=True)
threading.Thread(target=run, daemon=True, name=f"hermes-bridge-worker-stderr-{self.profile}").start()
def _wait_ready(self) -> None:
proc = self.process
if proc is None or proc.stdout is None:
raise RuntimeError(f"profile worker {self.profile} did not start")
lines: queue.Queue[str | None] = queue.Queue()
ready_event = threading.Event()
def read_stdout() -> None:
assert proc.stdout is not None
try:
for line in proc.stdout:
if ready_event.is_set():
text = line.rstrip()
if text:
print(f"[hermes-bridge-worker:{self.profile}] {text}", file=sys.stderr, flush=True)
else:
lines.put(line)
finally:
lines.put(None)
threading.Thread(target=read_stdout, daemon=True, name=f"hermes-bridge-worker-stdout-{self.profile}").start()
deadline = time.time() + self.STARTUP_TIMEOUT_SECONDS
while time.time() < deadline:
if proc.poll() is not None:
raise RuntimeError(f"profile worker {self.profile} exited before ready")
try:
line = lines.get(timeout=0.1)
except queue.Empty:
continue
if line is None:
time.sleep(0.05)
continue
text = line.strip()
if text:
print(f"[hermes-bridge-worker:{self.profile}] {text}", file=sys.stderr, flush=True)
try:
data = json.loads(text)
if data.get("event") == "ready":
ready_event.set()
return
except Exception:
pass
self.stop()
raise RuntimeError(f"profile worker {self.profile} did not become ready within {self.STARTUP_TIMEOUT_SECONDS}s")
def stop(self) -> None:
with self._lock:
proc = self.process
self.process = None
if proc is None:
return
if proc.poll() is None:
proc.terminate()
try:
proc.wait(timeout=3)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait(timeout=3)
if self.endpoint.startswith("ipc://"):
try:
Path(self.endpoint.removeprefix("ipc://")).unlink(missing_ok=True)
except OSError:
pass
def request(self, req: dict[str, Any]) -> dict[str, Any]:
self.start()
self.last_used_at = time.time()
return _send_bridge_request(self.endpoint, req, self.REQUEST_TIMEOUT_SECONDS)
def _worker_endpoint(profile: str) -> str:
safe = hashlib.sha256(profile.encode("utf-8")).hexdigest()[:16]
if os.name == "nt":
port_base = int(os.environ.get("HERMES_AGENT_BRIDGE_WORKER_PORT_BASE", "18780"))
return f"tcp://127.0.0.1:{port_base + int(safe[:4], 16) % 1000}"
root = Path(tempfile.gettempdir()) / "hermes-agent-bridge-workers"
return f"ipc://{root / f'{safe}.sock'}"
def _connect_bridge_socket(endpoint: str, timeout: float) -> socket.socket:
if endpoint.startswith("ipc://"):
sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect(endpoint.removeprefix("ipc://"))
return sock
parsed = urlparse(endpoint)
if parsed.scheme != "tcp":
raise RuntimeError(f"unsupported endpoint scheme: {endpoint}")
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(timeout)
sock.connect((parsed.hostname or "127.0.0.1", int(parsed.port or 0)))
return sock
def _send_bridge_request(endpoint: str, req: dict[str, Any], timeout: float) -> dict[str, Any]:
sock = _connect_bridge_socket(endpoint, timeout)
try:
sock.sendall((json.dumps(req, ensure_ascii=False, default=str) + "\n").encode("utf-8"))
chunks: list[bytes] = []
while True:
chunk = sock.recv(65536)
if not chunk:
break
chunks.append(chunk)
if b"\n" in chunk:
break
line = b"".join(chunks).split(b"\n", 1)[0].strip()
if not line:
raise RuntimeError("worker closed without a response")
resp = json.loads(line.decode("utf-8"))
if not resp.get("ok"):
raise RuntimeError(str(resp.get("error") or "worker request failed"))
return resp
finally:
try:
sock.close()
except OSError:
pass
def _make_listen_socket(endpoint: str) -> socket.socket:
if endpoint.startswith("ipc://"):
if not hasattr(socket, "AF_UNIX"):
raise RuntimeError("ipc:// endpoints require Unix domain socket support; use tcp://host:port on this platform")
sock_path = Path(endpoint.removeprefix("ipc://"))
sock_path.parent.mkdir(parents=True, exist_ok=True)
try:
sock_path.unlink(missing_ok=True)
except OSError:
pass
server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
server.bind(str(sock_path))
return server
parsed = urlparse(endpoint)
if parsed.scheme != "tcp":
raise RuntimeError(f"unsupported endpoint scheme: {endpoint}")
host = parsed.hostname or "127.0.0.1"
port = int(parsed.port or 0)
if port <= 0:
raise RuntimeError(f"tcp endpoint requires a port: {endpoint}")
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind((host, port))
return server
def _read_json_request(conn: socket.socket) -> dict[str, Any]:
chunks: list[bytes] = []
while True:
chunk = conn.recv(65536)
if not chunk:
break
chunks.append(chunk)
if b"\n" in chunk:
break
if not chunks:
raise RuntimeError("empty request")
line = b"".join(chunks).split(b"\n", 1)[0].strip()
if not line:
raise RuntimeError("empty request")
return json.loads(line.decode("utf-8"))
def _write_json_response(conn: socket.socket, resp: dict[str, Any]) -> None:
payload = json.dumps(resp, ensure_ascii=False, default=str) + "\n"
conn.sendall(payload.encode("utf-8"))
class BridgeBroker:
IDLE_TIMEOUT_SECONDS = 30 * 60
GC_INTERVAL_SECONDS = 60
def __init__(self, endpoint: str, agent_root: str | None = None, hermes_home: str | None = None) -> None:
self.endpoint = endpoint
self.agent_root = agent_root
self.hermes_home = hermes_home
self._workers: dict[str, WorkerProcess] = {}
self._run_profile: dict[str, str] = {}
self._session_profile: dict[str, str] = {}
self._approval_profile: dict[str, str] = {}
self._compression_profile: dict[str, str] = {}
self._lock = threading.RLock()
self._stop = threading.Event()
self._last_gc = time.time()
def _normalize_profile(self, value: Any) -> str:
profile = str(value or "").strip()
return profile or "default"
def _worker_for_profile(self, profile: str) -> WorkerProcess:
profile = self._normalize_profile(profile)
with self._lock:
worker = self._workers.get(profile)
if worker is None:
worker = WorkerProcess(profile, _worker_endpoint(profile), self.agent_root, self.hermes_home)
self._workers[profile] = worker
return worker
def _profile_for_run(self, run_id: str) -> str:
with self._lock:
profile = self._run_profile.get(run_id)
if not profile:
raise KeyError(f"unknown run: {run_id}")
return profile
def _profile_for_session(self, session_id: str, fallback_profile: Any = None) -> str:
with self._lock:
profile = self._session_profile.get(session_id)
if not profile:
fallback = self._normalize_profile(fallback_profile)
if fallback_profile is not None and fallback:
return fallback
raise KeyError(f"unknown session: {session_id}")
return profile
def _record_response_routes(self, profile: str, resp: dict[str, Any]) -> None:
run_id = str(resp.get("run_id") or "")
session_id = str(resp.get("session_id") or "")
with self._lock:
if run_id:
self._run_profile[run_id] = profile
if session_id:
self._session_profile[session_id] = profile
for event in resp.get("events") or []:
if not isinstance(event, dict):
continue
approval_id = str(event.get("approval_id") or "")
if approval_id:
self._approval_profile[approval_id] = profile
request_id = str(event.get("request_id") or "")
if event.get("event") == "bridge.compression.requested" and request_id:
self._compression_profile[request_id] = profile
if event.get("event") in {"bridge.compression.completed", "bridge.compression.failed"} and request_id:
self._compression_profile.pop(request_id, None)
def _forward(self, profile: str, req: dict[str, Any]) -> dict[str, Any]:
worker = self._worker_for_profile(profile)
forwarded = dict(req)
forwarded["profile"] = profile
resp = worker.request(forwarded)
self._record_response_routes(profile, resp)
return resp
def handle(self, req: dict[str, Any]) -> dict[str, Any]:
action = str(req.get("action") or "").strip()
if not action:
raise ValueError("action is required")
if action == "ping":
with self._lock:
workers = {profile: worker.running for profile, worker in self._workers.items()}
return {"pong": True, "time": time.time(), "mode": "broker", "workers": workers}
if action == "worker_ping":
profile = self._normalize_profile(req.get("profile"))
resp = self._forward(profile, {"action": "ping"})
resp["worker_profile"] = profile
return resp
if action == "chat":
profile = self._normalize_profile(req.get("profile"))
return self._forward(profile, req)
if action in {"get_result", "get_output"}:
profile = self._profile_for_run(str(req.get("run_id") or ""))
return self._forward(profile, req)
if action in {"interrupt", "steer", "get_history", "destroy"}:
session_id = str(req.get("session_id") or "")
profile = self._profile_for_session(session_id, req.get("profile"))
resp = self._forward(profile, req)
if action == "destroy":
with self._lock:
self._session_profile.pop(session_id, None)
return resp
if action == "approval_respond":
approval_id = str(req.get("approval_id") or "").strip()
if not approval_id:
raise ValueError("approval_id is required")
with self._lock:
profile = self._approval_profile.get(approval_id)
if not profile:
raise KeyError(f"unknown approval request: {approval_id}")
return self._forward(profile, req)
if action == "compression_respond":
request_id = str(req.get("request_id") or "").strip()
if not request_id:
raise ValueError("request_id is required")
with self._lock:
profile = self._compression_profile.get(request_id)
if not profile:
raise KeyError(f"unknown compression request: {request_id}")
return self._forward(profile, req)
if action == "destroy_all":
with self._lock:
workers = list(self._workers.values())
self._run_profile.clear()
self._session_profile.clear()
self._approval_profile.clear()
self._compression_profile.clear()
destroyed = 0
for worker in workers:
if not worker.running:
worker.stop()
continue
try:
resp = worker.request({"action": "destroy_all"})
destroyed += int(resp.get("destroyed") or 0)
except Exception:
pass
return {"destroyed": destroyed}
if action == "destroy_profile":
profile = self._normalize_profile(req.get("profile"))
with self._lock:
worker = self._workers.get(profile)
self._run_profile = {key: value for key, value in self._run_profile.items() if value != profile}
self._session_profile = {key: value for key, value in self._session_profile.items() if value != profile}
self._approval_profile = {key: value for key, value in self._approval_profile.items() if value != profile}
self._compression_profile = {key: value for key, value in self._compression_profile.items() if value != profile}
if worker is None or not worker.running:
if worker is not None:
worker.stop()
return {"profile": profile, "destroyed": 0}
try:
resp = worker.request({"action": "destroy_all"})
return {"profile": profile, "destroyed": int(resp.get("destroyed") or 0)}
except Exception:
return {"profile": profile, "destroyed": 0}
if action == "list":
sessions: list[Any] = []
with self._lock:
workers = list(self._workers.items())
for profile, worker in workers:
if not worker.running:
continue
try:
resp = worker.request({"action": "list"})
for session in resp.get("sessions") or []:
if isinstance(session, dict):
session.setdefault("profile", profile)
sessions.append(session)
except Exception:
pass
return {"sessions": sessions}
if action == "shutdown":
self._stop.set()
with self._lock:
workers = list(self._workers.values())
for worker in workers:
if not worker.running:
worker.stop()
continue
try:
worker.request({"action": "shutdown"})
except Exception:
worker.stop()
return {"status": "shutting_down"}
raise ValueError(f"unknown action: {action}")
def _make_server_socket(self) -> socket.socket:
return _make_listen_socket(self.endpoint)
def _read_request(self, conn: socket.socket) -> dict[str, Any]:
return _read_json_request(conn)
def _write_response(self, conn: socket.socket, resp: dict[str, Any]) -> None:
_write_json_response(conn, resp)
def _gc_idle_workers(self) -> None:
now = time.time()
if now - self._last_gc < self.GC_INTERVAL_SECONDS:
return
self._last_gc = now
with self._lock:
idle = [
profile for profile, worker in self._workers.items()
if worker.running and now - worker.last_used_at > self.IDLE_TIMEOUT_SECONDS
]
for profile in idle:
with self._lock:
worker = self._workers.pop(profile, None)
if worker:
worker.stop()
def serve_forever(self) -> None:
server = self._make_server_socket()
server.listen(64)
server.settimeout(0.2)
print(json.dumps({"event": "ready", "endpoint": self.endpoint, "mode": "broker"}), flush=True)
while not self._stop.is_set():
conn: socket.socket | None = None
try:
try:
conn, _addr = server.accept()
except socket.timeout:
self._gc_idle_workers()
continue
try:
req = self._read_request(conn)
data = self.handle(req)
resp = {"ok": True, **_jsonable(data)}
except Exception as exc:
resp = {
"ok": False,
"error": str(exc),
"error_type": exc.__class__.__name__,
}
self._write_response(conn, resp)
except KeyboardInterrupt:
break
except Exception as exc:
print(f"[hermes-bridge-broker] server loop error: {exc}", file=sys.stderr, flush=True)
finally:
if conn is not None:
try:
conn.close()
except OSError:
pass
with self._lock:
workers = list(self._workers.values())
self._workers.clear()
for worker in workers:
worker.stop()
server.close()
if self.endpoint.startswith("ipc://"):
try:
Path(self.endpoint.removeprefix("ipc://")).unlink(missing_ok=True)
except OSError:
pass
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(description="Hermes AIAgent in-process bridge")
parser.add_argument("--endpoint", default=os.environ.get("HERMES_AGENT_BRIDGE_ENDPOINT", DEFAULT_ENDPOINT))
parser.add_argument("--agent-root", default=os.environ.get("HERMES_AGENT_ROOT", DEFAULT_AGENT_ROOT))
parser.add_argument("--hermes-home", default=os.environ.get("HERMES_HOME", DEFAULT_HERMES_HOME))
parser.add_argument("--worker-profile", default=os.environ.get("HERMES_AGENT_BRIDGE_WORKER_PROFILE"))
args = parser.parse_args(argv)
_set_path_env(args.agent_root, args.hermes_home)
_ensure_agent_imports()
BridgeServer(args.endpoint).serve_forever()
if args.worker_profile:
_set_worker_profile_env(str(args.worker_profile or "default"))
BridgeServer(args.endpoint).serve_forever()
else:
BridgeBroker(args.endpoint, args.agent_root, args.hermes_home).serve_forever()
return 0