Machine Coding Problem

On-Call Pager / Escalation System

maco · All · devops · escalation-state-machine
Commonly Asked By: PagerDuty, Atlassian, Google

Functional Scope (In-Scope)

  • Dynamic Shift Rotation Calendars: Resolves exactly which SRE or on-call engineer is active at a specific timestamp based on shift lengths and schedules.
  • Escalation State-Machine Engine: Steps systematically from Tier 1 up to Tier N based on customizable SLA acknowledgment countdown intervals.
  • Atomic Acknowledgment Pipeline: Thread-safely records responder status and terminates pending escalation tasks.
  • Intelligent Suppression Deduplicator: Throttles identical alert signatures generated in rapid succession to prevent page exhaustion.

Explicit Boundaries (Out-of-Scope)

  • Voice Call Integrations: Real telephony/SMS/email providers are not integrated; notification gateways (SMS/Email/Robocalls) are simulated using console dispatch abstractions.
  • Complex Holiday Override Sets: Holiday calendars are not modeled; the design focuses on clean shift-rotation interval mathematics and basic override logic.

Production reference implementations demonstrating calendar rotations, scheduled SLAs, and alert deduplication in Java and Python:

// ─── JAVA BLUEPRINT ──────────────────────────────────────────────────────────
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicReference;

/**
 * Lifecycle states an alert can be in.
 */
enum AlertStatus {
    /** The alert is open and has not been acknowledged. */
    FIRING,

    /** A responder has acknowledged the alert. */
    ACKNOWLEDGED,

    /** The alert has been resolved. */
    RESOLVED
}

/**
 * Mutable state of a single alert: identity, current status, the escalation
 * tier it has reached, and who acknowledged it.
 *
 * <p>The status lives in an {@link AtomicReference} so callers can perform
 * compare-and-set transitions. The remaining mutable fields are
 * {@code volatile} so writes from one thread are visible to readers on another.
 */
class Alert {
    private final String alertId;
    private final String serviceId;
    private final String signature;
    private final String title;
    private final long createdAt;
    private final AtomicReference<AlertStatus> status = new AtomicReference<>(AlertStatus.FIRING);
    private volatile int currentLevelIndex = 0;
    private volatile String acknowledgedBy = null;
    private volatile long acknowledgedAt = 0L;

    /**
     * Creates an alert in the {@code FIRING} state at escalation index 0,
     * stamped with the current wall-clock time.
     *
     * @param alertId   unique identifier of this alert
     * @param serviceId identifier of the service the alert belongs to
     * @param signature signature string of the alert
     * @param title     human-readable title
     */
    public Alert(String alertId, String serviceId, String signature, String title) {
        this.alertId = alertId;
        this.serviceId = serviceId;
        this.signature = signature;
        this.title = title;
        this.createdAt = System.currentTimeMillis();
    }

    public String getAlertId() { return alertId; }
    public String getServiceId() { return serviceId; }
    public String getSignature() { return signature; }
    public String getTitle() { return title; }

    /** @return creation time in milliseconds, as returned by {@link System#currentTimeMillis()} */
    public long getCreatedAt() { return createdAt; }

    public AlertStatus getStatus() { return status.get(); }

    /**
     * Atomically moves the status from {@code expect} to {@code update}.
     *
     * @return {@code true} if the status was {@code expect} and has been replaced
     */
    public boolean compareAndSetStatus(AlertStatus expect, AlertStatus update) {
        return status.compareAndSet(expect, update);
    }

    /** @return zero-based index of the escalation tier this alert is currently at */
    public int getCurrentLevelIndex() { return currentLevelIndex; }

    /**
     * Advances the alert to the next escalation tier.
     *
     * <p>{@code volatile} alone does not make {@code ++} atomic (it is a
     * read-modify-write), so the increment is synchronized to avoid lost
     * updates. The field stays {@code volatile} so
     * {@link #getCurrentLevelIndex()} can read it without taking the lock.
     */
    public synchronized void incrementLevelIndex() { this.currentLevelIndex++; }

    /** @return name of the acknowledging responder, or {@code null} if none was recorded */
    public String getAcknowledgedBy() { return acknowledgedBy; }
    public void setAcknowledgedBy(String responder) { this.acknowledgedBy = responder; }

    /** @return acknowledgment time in milliseconds, or {@code 0} if none was recorded */
    public long getAcknowledgedAt() { return acknowledgedAt; }
    public void setAcknowledgedAt(long timeMs) { this.acknowledgedAt = timeMs; }
}

/**
 * One tier of an escalation policy: who to notify and how long to wait
 * before moving on.
 */
class EscalationLevel {
    private final int level;
    // Role or team name, e.g. "PRIMARY_ONCALL", "SECONDARY_ONCALL", "SRE_TEAM"
    private final String targetRoleOrTeam;
    private final int timeoutSeconds;

    /**
     * @param level            tier number of this level
     * @param targetRoleOrTeam role or team to notify at this tier
     * @param timeoutSeconds   seconds associated with this tier's timeout
     */
    public EscalationLevel(int level, String targetRoleOrTeam, int timeoutSeconds) {
        this.timeoutSeconds = timeoutSeconds;
        this.targetRoleOrTeam = targetRoleOrTeam;
        this.level = level;
    }

    /** @return the tier number */
    public int getLevel() {
        return this.level;
    }

    /** @return the role or team name targeted by this tier */
    public String getTargetRoleOrTeam() {
        return this.targetRoleOrTeam;
    }

    /** @return the timeout of this tier, in seconds */
    public int getTimeoutSeconds() {
        return this.timeoutSeconds;
    }
}

/**
 * Ordered list of escalation tiers that applies to one service.
 *
 * <p>The tier list is copied at construction and exposed only as an
 * unmodifiable view, so the policy cannot be changed after it is created.
 */
class EscalationPolicy {
    private final String policyId;
    private final String serviceId;
    private final List<EscalationLevel> levels;

    /**
     * @param policyId  identifier of this policy
     * @param serviceId identifier of the service this policy applies to
     * @param levels    tiers in escalation order; copied, so later changes to the
     *                  caller's list do not affect this policy
     * @throws NullPointerException if {@code levels} is {@code null}
     */
    public EscalationPolicy(String policyId, String serviceId, List<EscalationLevel> levels) {
        this.policyId = policyId;
        this.serviceId = serviceId;
        // Defensive copy wrapped read-only: previously getLevels() handed out the
        // internal ArrayList, letting any caller add or remove tiers in place.
        this.levels = Collections.unmodifiableList(new ArrayList<>(levels));
    }

    public String getPolicyId() { return policyId; }
    public String getServiceId() { return serviceId; }

    /**
     * @return the tiers in escalation order, as an unmodifiable list; mutating
     *         calls on it throw {@link UnsupportedOperationException}
     */
    public List<EscalationLevel> getLevels() { return levels; }
}

/**
 * Fixed-length round-robin rotation: engineers take consecutive shifts of
 * {@code shiftDurationMs}, starting with the first engineer at
 * {@code epochStartMs} and wrapping around after the last.
 */
class OnCallSchedule {
    private final String scheduleId;
    private final List<String> engineers;
    private final long shiftDurationMs;
    private final long epochStartMs;

    /**
     * @param scheduleId      identifier of this schedule
     * @param engineers       rotation order; copied, may be empty
     * @param shiftDurationMs length of one shift in milliseconds; must be positive
     * @param epochStartMs    timestamp (ms) at which the first engineer's first shift begins
     * @throws IllegalArgumentException if {@code shiftDurationMs} is zero or negative
     * @throws NullPointerException     if {@code engineers} is {@code null}
     */
    public OnCallSchedule(String scheduleId, List<String> engineers, long shiftDurationMs, long epochStartMs) {
        // Fail fast: a zero duration would divide by zero in resolveOnCall, and a
        // negative one could yield a negative list index there.
        if (shiftDurationMs <= 0) {
            throw new IllegalArgumentException("shiftDurationMs must be positive, got " + shiftDurationMs);
        }
        this.scheduleId = scheduleId;
        this.engineers = new ArrayList<>(engineers);
        this.shiftDurationMs = shiftDurationMs;
        this.epochStartMs = epochStartMs;
    }

    /**
     * Resolves who is on call at the given instant.
     *
     * @param timestampMs instant to resolve, in milliseconds
     * @return the engineer whose shift covers {@code timestampMs};
     *         {@code "UNASSIGNED"} if the rotation is empty. Timestamps before
     *         the rotation start are treated as the rotation start, i.e. they
     *         resolve to the first engineer.
     */
    public String resolveOnCall(long timestampMs) {
        if (engineers.isEmpty()) return "UNASSIGNED";
        long timeSinceEpoch = timestampMs - epochStartMs;
        if (timeSinceEpoch < 0) timeSinceEpoch = 0;
        // Number of completed shifts, wrapped around the rotation size. The
        // result is in [0, size), so narrowing to int is safe.
        int index = (int) ((timeSinceEpoch / shiftDurationMs) % engineers.size());
        return engineers.get(index);
    }
}

/**
 * Outbound channel used to page a responder about an alert.
 */
interface NotificationDispatcher {

    /**
     * Sends one notification.
     *
     * @param engineerName recipient of the notification
     * @param alertId      identifier of the alert being reported
     * @param message      text of the notification
     */
    void dispatch(
        String engineerName,
        String alertId,
        String message
    );
}

/**
 * {@link NotificationDispatcher} that prints each page as a single line on
 * standard output instead of contacting a real gateway.
 */
class ConsoleNotificationDispatcher implements NotificationDispatcher {

    /**
     * Prints a {@code PAGE SENT} line containing the recipient, the alert id,
     * and the message.
     */
    @Override
    public void dispatch(String engineerName, String alertId, String message) {
        StringBuilder line = new StringBuilder("PAGE SENT -> Engineer: ");
        line.append(engineerName)
            .append(" | Alert: ").append(alertId)
            .append(" | Message: ").append(message);
        System.out.println(line);
    }
}

/**
 * Accepts alerts, suppresses rapid duplicates, pages responders tier by tier,
 * and stops escalating once an alert is acknowledged.
 *
 * <p>Shared state is kept in {@link ConcurrentHashMap}s and timed escalation
 * steps run on an internal four-thread {@link ScheduledExecutorService}.
 * Call {@link #shutdown()} when the manager is no longer needed.
 */
class EscalationManager {
    private final ConcurrentHashMap<String, Alert> activeAlerts = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, EscalationPolicy> policies = new ConcurrentHashMap<>(); // serviceId -> policy
    private final ConcurrentHashMap<String, OnCallSchedule> schedules = new ConcurrentHashMap<>(); // role -> schedule
    private final ConcurrentHashMap<String, ScheduledFuture<?>> pendingTasks = new ConcurrentHashMap<>(); // alertId -> next escalation timer
    private final ConcurrentHashMap<String, Long> lastAlertTimeMap = new ConcurrentHashMap<>(); // serviceId:signature -> timestamp
    private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(4);
    private final NotificationDispatcher dispatcher;
    private final long deduplicationWindowMs = 60000; // 1-minute alert suppression

    /**
     * @param dispatcher channel used to page responders
     * @throws NullPointerException if {@code dispatcher} is {@code null}
     */
    public EscalationManager(NotificationDispatcher dispatcher) {
        this.dispatcher = Objects.requireNonNull(dispatcher, "dispatcher");
    }

    /** Registers {@code policy} for its service id, replacing any policy already registered for that service. */
    public void addPolicy(EscalationPolicy policy) {
        policies.put(policy.getServiceId(), policy);
    }

    /**
     * Registers an on-call rotation for a role. When an escalation tier targets
     * {@code role}, the person on call in {@code schedule} is paged instead of
     * the role name itself.
     */
    public void addSchedule(String role, OnCallSchedule schedule) {
        schedules.put(role, schedule);
    }

    /**
     * Opens a new alert and starts escalating it, unless an alert with the same
     * service id and signature was accepted within the deduplication window.
     *
     * @param serviceId service the alert belongs to; selects the escalation policy
     * @param signature alert signature; together with {@code serviceId} forms the deduplication key
     * @param title     human-readable title included in pages
     * @return the generated alert id, or {@code null} if the alert was suppressed as a duplicate
     */
    public String triggerAlert(String serviceId, String signature, String title) {
        String dedupKey = serviceId + ":" + signature;
        long now = System.currentTimeMillis();

        // 1. Deduplication check. compute() makes the read-and-record step atomic
        //    per key, so two concurrent triggers of the same signature cannot both
        //    pass. A suppressed alert leaves the stored timestamp untouched, so the
        //    window is measured from the last alert that was actually accepted.
        boolean[] suppressed = {false};
        lastAlertTimeMap.compute(dedupKey, (key, lastTriggered) -> {
            if (lastTriggered != null && (now - lastTriggered < deduplicationWindowMs)) {
                suppressed[0] = true;
                return lastTriggered;
            }
            return now;
        });
        if (suppressed[0]) {
            System.out.println("DEDUPLICATED -> Suppressed redundant alert for service: " + serviceId + " (" + title + ")");
            return null;
        }

        String alertId = UUID.randomUUID().toString();
        Alert alert = new Alert(alertId, serviceId, signature, title);
        activeAlerts.put(alertId, alert);

        System.out.println("ALERT FIRING -> ID: " + alertId + " | Service: " + serviceId + " | Title: " + title);

        // 2. Start escalation state machine at the alert's current tier (index 0).
        executeEscalationLevel(alertId);
        return alertId;
    }

    /**
     * Pages the responder for the alert's current tier and arms a timer that
     * advances to the next tier when the current tier's timeout elapses.
     * Does nothing if the alert is unknown or no longer {@code FIRING}.
     */
    private void executeEscalationLevel(String alertId) {
        Alert alert = activeAlerts.get(alertId);
        if (alert == null || alert.getStatus() != AlertStatus.FIRING) return;

        EscalationPolicy policy = policies.get(alert.getServiceId());
        if (policy == null || policy.getLevels().isEmpty()) {
            pendingTasks.remove(alertId); // nothing further will be scheduled for this alert
            System.out.println("ERROR -> No escalation policy registered for service: " + alert.getServiceId());
            return;
        }

        List<EscalationLevel> levels = policy.getLevels();
        int currentLvlIdx = alert.getCurrentLevelIndex();
        if (currentLvlIdx >= levels.size()) {
            // All tiers exhausted: drop the finished timer entry. The alert itself
            // stays in activeAlerts so a late acknowledgment is still accepted.
            pendingTasks.remove(alertId);
            System.out.println("DEAD LETTER QUEUE -> Alert unacknowledged at all tiers: " + alertId);
            return;
        }

        EscalationLevel level = levels.get(currentLvlIdx);
        String responder = level.getTargetRoleOrTeam();

        // Resolve schedule if target role matches on-call rotations
        OnCallSchedule schedule = schedules.get(responder);
        if (schedule != null) {
            responder = schedule.resolveOnCall(System.currentTimeMillis());
        }

        // Notify responder. A failing gateway must not stop escalation: an
        // exception escaping a scheduled task is captured by its future and the
        // chain would end without anyone at the next tier being paged.
        try {
            dispatcher.dispatch(responder, alertId, "Failing Service: " + alert.getServiceId() + " - " + alert.getTitle() + " (Tier: " + (currentLvlIdx + 1) + ")");
        } catch (RuntimeException e) {
            System.out.println("ERROR -> Notification dispatch failed for alert: " + alertId + " (" + e + ")");
        }

        // Schedule next tier escalation task if not acknowledged in time
        ScheduledFuture<?> future = scheduler.schedule(() -> {
            alert.incrementLevelIndex();
            executeEscalationLevel(alertId);
        }, level.getTimeoutSeconds(), TimeUnit.SECONDS);

        pendingTasks.put(alertId, future);

        // An acknowledgment may have landed between the FIRING check above and
        // the put(); it would have found no timer to cancel. Re-check and
        // cancel the timer just registered so it does not linger.
        if (alert.getStatus() != AlertStatus.FIRING) {
            pendingTasks.remove(alertId, future);
            future.cancel(false);
        }
    }

    /**
     * Acknowledges a firing alert and cancels its pending escalation timer.
     *
     * @param alertId   id returned by {@link #triggerAlert}
     * @param responder name recorded as the acknowledger
     * @return {@code true} if this call moved the alert from {@code FIRING} to
     *         {@code ACKNOWLEDGED}; {@code false} if the alert is unknown or was
     *         not {@code FIRING}
     */
    public boolean acknowledgeAlert(String alertId, String responder) {
        Alert alert = activeAlerts.get(alertId);
        if (alert == null) return false;

        // The compare-and-set lets exactly one caller win the transition.
        if (alert.compareAndSetStatus(AlertStatus.FIRING, AlertStatus.ACKNOWLEDGED)) {
            alert.setAcknowledgedBy(responder);
            alert.setAcknowledgedAt(System.currentTimeMillis());

            // Cancel any pending timed escalation tasks
            ScheduledFuture<?> future = pendingTasks.remove(alertId);
            if (future != null) {
                future.cancel(false);
            }

            System.out.println("ALERT ACKNOWLEDGED -> ID: " + alertId + " | Confirmed by: " + responder);
            return true;
        }
        return false;
    }

    /** Initiates an orderly shutdown of the internal scheduler. */
    public void shutdown() {
        scheduler.shutdown();
    }
}

/**
 * Console demo: one alert acknowledged at tier 1, and a second alert that is
 * left long enough to escalate to tier 2 before being acknowledged.
 */
public class Main {
    /**
     * Runs the simulation.
     *
     * @param args unused
     * @throws InterruptedException if the main thread is interrupted while sleeping
     */
    public static void main(String[] args) throws InterruptedException {
        System.out.println("=== JAVA ON-CALL PAGER SIMULATION ===");
        ConsoleNotificationDispatcher dispatcher = new ConsoleNotificationDispatcher();
        EscalationManager manager = new EscalationManager(dispatcher);

        // try/finally so the scheduler is shut down even if a sleep is
        // interrupted; otherwise its worker threads could keep the JVM alive.
        try {
            // Two engineers rotating in one-hour shifts, starting now.
            List<String> primaryOnCall = Arrays.asList("Alice", "Bob");
            OnCallSchedule primarySchedule = new OnCallSchedule("sched-1", primaryOnCall, 3600000L, System.currentTimeMillis());
            manager.addSchedule("PRIMARY_ONCALL", primarySchedule);

            // Tier 1 waits 2 s, tier 2 waits 5 s. No schedule is registered for
            // "SRE_LEAD", so that role name itself is paged.
            List<EscalationLevel> levels = Arrays.asList(
                new EscalationLevel(1, "PRIMARY_ONCALL", 2),
                new EscalationLevel(2, "SRE_LEAD", 5)
            );
            EscalationPolicy policy = new EscalationPolicy("policy-1", "service-a", levels);
            manager.addPolicy(policy);

            System.out.println("Triggering alert...");
            String alertId = manager.triggerAlert("service-a", "sig-1", "Disk Space Low");

            // Acknowledge after 1 s, inside the 2 s tier-1 timeout.
            Thread.sleep(1000);
            if (alertId != null) {
                manager.acknowledgeAlert(alertId, "Alice");
            }

            System.out.println("\nTriggering second alert (should escalate to Tier 2)...");
            String secondAlertId = manager.triggerAlert("service-a", "sig-2", "CPU Exhaustion");
            // Wait past the 2 s tier-1 timeout so tier 2 is paged.
            Thread.sleep(3000);

            if (secondAlertId != null) {
                manager.acknowledgeAlert(secondAlertId, "Bob");
            }
        } finally {
            manager.shutdown();
        }
        System.out.println("=== END OF JAVA SIMULATION ===");
    }
}