buildmymcpserver/ops/bmm/uptime-check.sh

#!/usr/bin/env bash
# Self-hosted uptime monitor — probes homepage, API health, and robots.txt every 5 min.
# Sends SMS via notify.sh on transition into / out of failure state. Pings
# a healthchecks.io heartbeat (HEALTHCHECKS_HEARTBEAT_URL) on every success
# so that if THIS box dies the external service alerts.
#
# Cron: */5 * * * * root /opt/bmm-ops/uptime-check.sh
#
# State file tracks last-known status so repeated failures don't spam SMS.

# Shell options: referencing an unset variable is an error, and a pipeline
# reports the failure of any of its stages. `-e` is not enabled, so a failing
# command does not abort the run.
set -u
set -o pipefail

# Fixed locations used by the rest of the script.
NOTIFY=/opt/bmm-ops/notify.sh          # alert sender invoked on state changes
LOG_FILE=/var/log/bmm-uptime.log       # one line is appended per run
STATE_DIR=/var/lib/bmm-ops
STATE_FILE="$STATE_DIR/uptime.state"   # holds the last recorded status

# UTC timestamp of this run, e.g. 2024-01-31T12:34:56Z.
TS="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"

# Make sure the state directory exists before anything writes into it.
mkdir -p "$STATE_DIR"

# Endpoints to probe, one "URL|expected-HTTP-status" entry each. Every entry
# is checked on its own, and a single mismatch (e.g. web up, api down) is
# enough for the run to be reported as "down".
TARGETS=()
TARGETS+=("https://buildmymcpserver.com/|200")
TARGETS+=("https://buildmymcpserver.com/api/health|200")
TARGETS+=("https://buildmymcpserver.com/robots.txt|200")

# Print the HTTP status code curl reports for URL $1, or exactly "000" when
# curl itself fails (DNS, connect, TLS, timeout, ...).
#
# curl's -w output is already "000" when no response was received, so the
# fallback has to REPLACE the captured text. Appending a second "000" via
# `|| echo` made the failure report read "got 000000" (or "200000" when the
# transfer timed out after the headers arrived).
probe_http_code() {
  local code
  if code=$(curl -sS -o /dev/null --max-time 8 -w "%{http_code}" "$1" 2>/dev/null); then
    printf '%s\n' "$code"
  else
    printf '000\n'
  fi
}

# Probe every target and collect one human-readable line per mismatch.
failures=()
for target in "${TARGETS[@]}"; do
  url="${target%|*}"     # everything before the last '|'
  want="${target##*|}"   # everything after the last '|'
  got=$(probe_http_code "$url")
  if [ "$got" != "$want" ]; then
    failures+=("${url} expected ${want} got ${got}")
  fi
done

# Print the status recorded in state file $1: "down" only when the file's
# first line is exactly "down", otherwise "up".
#
# A missing, empty or unrecognised state file therefore counts as "up". The
# raw file content used to be taken as-is, so a truncated/corrupt file gave a
# value that was neither "up" nor "down" and the up→down alert further down
# never fired. Defaulting to "up" errs on the side of sending the alert.
load_prev_state() {
  local recorded=""
  if [ -f "$1" ]; then
    # `read` returns non-zero for a file without a trailing newline (or an
    # empty one) but still assigns what it read, hence the `|| true`.
    IFS= read -r recorded < "$1" || true
  fi
  if [ "$recorded" = "down" ]; then
    printf 'down\n'
  else
    printf 'up\n'
  fi
}

PREV=$(load_prev_state "$STATE_FILE")

# Print the HEALTHCHECKS_HEARTBEAT_URL value found in dotenv-style file $1
# (first matching line wins); prints nothing when the file or key is absent.
# The file is grep-parsed rather than `source`d so unquoted env values are
# never evaluated as shell. One layer of surrounding double and/or single
# quotes is removed, and carriage returns are dropped so a CRLF-terminated
# file does not leave a stray \r at the end of the URL.
heartbeat_url_from() {
  grep -E '^HEALTHCHECKS_HEARTBEAT_URL=' "$1" 2>/dev/null \
    | head -1 \
    | cut -d= -f2- \
    | tr -d '\r' \
    | sed 's/^"\(.*\)"$/\1/; s/^'"'"'\(.*\)'"'"'$/\1/'
}

# Run the notifier with event name $1 and message $2. The state file has
# already been updated when this is called, so a failed send is not retried
# on the next tick; record the failure in the log instead of dropping it
# silently.
send_alert() {
  if ! "$NOTIFY" "$1" "$2"; then
    echo "[${TS}] notify failed: $1" >> "$LOG_FILE"
  fi
}

if [ "${#failures[@]}" -eq 0 ]; then
  # Every probe matched its expected status.
  echo "[${TS}] up" >> "$LOG_FILE"
  echo "up" > "$STATE_FILE"
  if [ "$PREV" = "down" ]; then
    # Transition down→up: announce the recovery once.
    send_alert "uptime-recovered" "all probes healthy at ${TS}"
  fi
  # Heartbeat for external watchdog (signals "box itself is alive").
  HEALTHCHECKS_HEARTBEAT_URL="$(heartbeat_url_from /opt/buildmymcpserver/.env.production)"
  if [ -n "${HEALTHCHECKS_HEARTBEAT_URL:-}" ]; then
    # Best effort: a failed heartbeat ping must not affect this run.
    curl -fsS -o /dev/null --max-time 8 "${HEALTHCHECKS_HEARTBEAT_URL}" 2>/dev/null || true
  fi
else
  echo "[${TS}] down: ${failures[*]}" >> "$LOG_FILE"
  echo "down" > "$STATE_FILE"
  # Alert on the first failing tick. Anything other than a recorded "down"
  # (including an empty or unreadable previous state) counts as "was up", so
  # a damaged state file cannot suppress the outage alert.
  if [ "$PREV" != "down" ]; then
    send_alert "uptime-down" "${failures[*]}"
  fi
  # Intentionally do NOT alert again on subsequent ticks while still down —
  # avoids SMS storm during a sustained incident. Recovery edge re-notifies.
fi

# Always exit 0: probe failures are reported through the log, the state file
# and notify.sh above, not through this script's exit status.
exit 0