Functional Scope (In-Scope)
- Dynamic Shift Rotation Calendars: Resolves exactly which SRE or on-call engineer is active at a specific timestamp based on shift lengths and schedules.
- Escalation State-Machine Engine: Steps systematically from Tier 1 up to Tier N based on customizable SLA acknowledgment countdown intervals.
- Atomic Acknowledgment Pipeline: Thread-safely records responder status and terminates pending escalation tasks.
- Intelligent Suppression Deduplicator: Throttles identical alert signatures generated in rapid succession to prevent page exhaustion.
Explicit Boundaries (Out-of-Scope)
- Voice Call Integrations: Real notification gateways (SMS/Email/Robocalls) are not integrated; they are simulated using console dispatch abstractions.
- Complex Holiday Override Sets: Not modeled; the design focuses on clean shift rotation interval mathematics and override logic.
Production reference implementations demonstrating calendar rotations, scheduled SLAs, and alert deduplication in Java and Python:
// ─── JAVA BLUEPRINT ──────────────────────────────────────────────────────────
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
/**
 * Lifecycle states of an alert.
 */
enum AlertStatus {
    /** The alert is open and has not been acknowledged. */
    FIRING,

    /** A responder has acknowledged the alert. */
    ACKNOWLEDGED,

    /** The alert has been resolved. */
    RESOLVED
}
/**
 * A single alert instance together with its mutable escalation state.
 *
 * <p>The identifying fields are fixed at construction time. The status is held in an
 * {@link AtomicReference} so that transitions can be made with compare-and-set, and the
 * escalation tier index is held in an {@link AtomicInteger} so that increments are atomic
 * (a {@code volatile int} incremented with {@code ++} is a non-atomic read-modify-write).
 *
 * <p>The acknowledgment fields are independent {@code volatile} writes; they are not
 * updated atomically with the status transition.
 */
class Alert {
    private final String alertId;
    private final String serviceId;
    private final String signature;
    private final String title;
    private final long createdAt;
    private final AtomicReference<AlertStatus> status = new AtomicReference<>(AlertStatus.FIRING);
    private final AtomicInteger currentLevelIndex = new AtomicInteger(0);
    private volatile String acknowledgedBy = null;
    private volatile long acknowledgedAt = 0L;

    /**
     * Creates a new alert in the {@link AlertStatus#FIRING} state at tier index 0.
     * The creation time is taken from {@link System#currentTimeMillis()}.
     *
     * @param alertId   unique identifier of this alert
     * @param serviceId identifier of the service the alert belongs to
     * @param signature signature string of the alert
     * @param title     human-readable title
     */
    public Alert(String alertId, String serviceId, String signature, String title) {
        this.alertId = alertId;
        this.serviceId = serviceId;
        this.signature = signature;
        this.title = title;
        this.createdAt = System.currentTimeMillis();
    }

    public String getAlertId() {
        return alertId;
    }

    public String getServiceId() {
        return serviceId;
    }

    public String getSignature() {
        return signature;
    }

    public String getTitle() {
        return title;
    }

    /** @return creation time in milliseconds since the Unix epoch */
    public long getCreatedAt() {
        return createdAt;
    }

    public AlertStatus getStatus() {
        return status.get();
    }

    /**
     * Atomically moves the status from {@code expect} to {@code update}.
     *
     * @return {@code true} if the status was {@code expect} and has been replaced
     */
    public boolean compareAndSetStatus(AlertStatus expect, AlertStatus update) {
        return status.compareAndSet(expect, update);
    }

    /** @return zero-based index of the escalation tier currently being paged */
    public int getCurrentLevelIndex() {
        return currentLevelIndex.get();
    }

    /** Atomically advances the escalation tier index by one. */
    public void incrementLevelIndex() {
        currentLevelIndex.incrementAndGet();
    }

    /** @return the responder recorded via {@link #setAcknowledgedBy}, or {@code null} if none */
    public String getAcknowledgedBy() {
        return acknowledgedBy;
    }

    public void setAcknowledgedBy(String responder) {
        this.acknowledgedBy = responder;
    }

    /** @return the time recorded via {@link #setAcknowledgedAt}, or {@code 0} if none */
    public long getAcknowledgedAt() {
        return acknowledgedAt;
    }

    public void setAcknowledgedAt(long timeMs) {
        this.acknowledgedAt = timeMs;
    }
}
/**
 * One tier of an escalation policy: who to page and how long to wait before moving on.
 */
class EscalationLevel {
    /** Tier number as supplied by the creator of the policy. */
    private final int level;

    /** Role or team to notify, e.g. "PRIMARY_ONCALL", "SECONDARY_ONCALL", "SRE_TEAM". */
    private final String targetRoleOrTeam;

    /** Acknowledgment timeout for this tier, in seconds. */
    private final int timeoutSeconds;

    /**
     * @param level            tier number
     * @param targetRoleOrTeam role or team to notify at this tier
     * @param timeoutSeconds   seconds to wait for an acknowledgment at this tier
     */
    public EscalationLevel(int level, String targetRoleOrTeam, int timeoutSeconds) {
        this.timeoutSeconds = timeoutSeconds;
        this.targetRoleOrTeam = targetRoleOrTeam;
        this.level = level;
    }

    /** @return the tier number */
    public int getLevel() {
        return this.level;
    }

    /** @return the role or team to notify */
    public String getTargetRoleOrTeam() {
        return this.targetRoleOrTeam;
    }

    /** @return the acknowledgment timeout in seconds */
    public int getTimeoutSeconds() {
        return this.timeoutSeconds;
    }
}
/**
 * An ordered list of escalation tiers bound to a service.
 *
 * <p>The tier list is copied at construction time and exposed only through an
 * unmodifiable view, so the policy cannot be changed after it has been created.
 */
class EscalationPolicy {
    private final String policyId;
    private final String serviceId;
    private final List<EscalationLevel> levels;

    /**
     * @param policyId  identifier of this policy
     * @param serviceId identifier of the service this policy applies to
     * @param levels    escalation tiers in the order they should be tried; copied defensively
     * @throws NullPointerException if {@code levels} is {@code null}
     */
    public EscalationPolicy(String policyId, String serviceId, List<EscalationLevel> levels) {
        this.policyId = policyId;
        this.serviceId = serviceId;
        // Copy, then wrap: callers can neither mutate via the list they passed in
        // nor via the list returned from getLevels().
        this.levels = Collections.unmodifiableList(new ArrayList<>(levels));
    }

    public String getPolicyId() {
        return policyId;
    }

    public String getServiceId() {
        return serviceId;
    }

    /**
     * @return an unmodifiable view of the escalation tiers, in order; mutating calls
     *         throw {@link UnsupportedOperationException}
     */
    public List<EscalationLevel> getLevels() {
        return levels;
    }
}
/**
 * A round-robin on-call rotation: engineers take fixed-length shifts in list order,
 * starting at a given epoch.
 */
class OnCallSchedule {
    private final String scheduleId;
    private final List<String> engineers;
    private final long shiftDurationMs;
    private final long epochStartMs;

    /**
     * @param scheduleId      identifier of this schedule
     * @param engineers       engineers in rotation order; copied defensively, may be empty
     * @param shiftDurationMs length of one shift in milliseconds; must be positive
     * @param epochStartMs    timestamp (ms) at which the first engineer's first shift begins
     * @throws IllegalArgumentException if {@code shiftDurationMs} is zero or negative, since
     *         the rotation index is computed by dividing by it
     * @throws NullPointerException if {@code engineers} is {@code null}
     */
    public OnCallSchedule(String scheduleId, List<String> engineers, long shiftDurationMs, long epochStartMs) {
        if (shiftDurationMs <= 0) {
            throw new IllegalArgumentException("shiftDurationMs must be positive, got " + shiftDurationMs);
        }
        this.scheduleId = scheduleId;
        this.engineers = new ArrayList<>(engineers);
        this.shiftDurationMs = shiftDurationMs;
        this.epochStartMs = epochStartMs;
    }

    /**
     * Resolves the engineer on call at the given instant.
     *
     * @param timestampMs instant to resolve, in milliseconds
     * @return the engineer whose shift covers {@code timestampMs}; the first engineer for
     *         instants before the epoch; {@code "UNASSIGNED"} if the rotation is empty
     */
    public String resolveOnCall(long timestampMs) {
        if (engineers.isEmpty()) {
            return "UNASSIGNED";
        }
        // Instants before the rotation epoch are treated as the start of the first shift.
        long elapsedMs = Math.max(0L, timestampMs - epochStartMs);
        long shiftNumber = elapsedMs / shiftDurationMs;
        int index = (int) (shiftNumber % engineers.size());
        return engineers.get(index);
    }
}
interface NotificationDispatcher {
void dispatch(String engineerName, String alertId, String message);
}
/**
 * {@link NotificationDispatcher} that writes each page as a single line to standard output.
 */
class ConsoleNotificationDispatcher implements NotificationDispatcher {
    /**
     * Prints one "PAGE SENT" line containing the recipient, alert id and message.
     *
     * @param engineerName recipient of the page
     * @param alertId      identifier of the alert being paged about
     * @param message      text of the page
     */
    @Override
    public void dispatch(String engineerName, String alertId, String message) {
        StringBuilder line = new StringBuilder("PAGE SENT -> Engineer: ");
        line.append(engineerName);
        line.append(" | Alert: ").append(alertId);
        line.append(" | Message: ").append(message);
        System.out.println(line);
    }
}
/**
 * Coordinates alert intake, deduplication, tiered escalation and acknowledgment.
 *
 * <p>Each triggered alert is paged to the first tier of its service's
 * {@link EscalationPolicy}. A timer is scheduled for that tier's timeout; if the alert is
 * still {@link AlertStatus#FIRING} when the timer fires, the next tier is paged, and so on
 * until the tiers are exhausted or the alert is acknowledged.
 *
 * <p>All registries are {@link ConcurrentHashMap}s and escalation timers run on an
 * internal four-thread scheduler.
 */
class EscalationManager {
    /** Suppression window used by the single-argument constructor (one minute). */
    private static final long DEFAULT_DEDUPLICATION_WINDOW_MS = 60000;

    private final ConcurrentHashMap<String, Alert> activeAlerts = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, EscalationPolicy> policies = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, OnCallSchedule> schedules = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, ScheduledFuture<?>> pendingTasks = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, Long> lastAlertTimeMap = new ConcurrentHashMap<>(); // serviceId:signature -> timestamp
    private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(4);
    private final NotificationDispatcher dispatcher;
    private final long deduplicationWindowMs;

    /**
     * Creates a manager with the default one-minute deduplication window.
     *
     * @param dispatcher channel used to page responders
     */
    public EscalationManager(NotificationDispatcher dispatcher) {
        this(dispatcher, DEFAULT_DEDUPLICATION_WINDOW_MS);
    }

    /**
     * Creates a manager with a custom deduplication window.
     *
     * @param dispatcher            channel used to page responders
     * @param deduplicationWindowMs time in milliseconds after a fired alert during which
     *                              alerts with the same service and signature are suppressed
     */
    public EscalationManager(NotificationDispatcher dispatcher, long deduplicationWindowMs) {
        this.dispatcher = dispatcher;
        this.deduplicationWindowMs = deduplicationWindowMs;
    }

    /** Registers (or replaces) the escalation policy for the policy's service id. */
    public void addPolicy(EscalationPolicy policy) {
        policies.put(policy.getServiceId(), policy);
    }

    /** Registers (or replaces) the on-call rotation that backs the given role name. */
    public void addSchedule(String role, OnCallSchedule schedule) {
        schedules.put(role, schedule);
    }

    /**
     * Fires a new alert and pages the first escalation tier, unless an alert with the same
     * service and signature fired within the deduplication window.
     *
     * @param serviceId service the alert belongs to
     * @param signature signature used, together with the service id, as deduplication key
     * @param title     human-readable title
     * @return the id of the new alert, or {@code null} if the alert was suppressed
     */
    public String triggerAlert(String serviceId, String signature, String title) {
        String dedupKey = serviceId + ":" + signature;
        long now = System.currentTimeMillis();
        // 1. Deduplication validation check
        if (!claimDedupWindow(dedupKey, now)) {
            System.out.println("DEDUPLICATED -> Suppressed redundant alert for service: " + serviceId + " (" + title + ")");
            return null;
        }
        String alertId = UUID.randomUUID().toString();
        Alert alert = new Alert(alertId, serviceId, signature, title);
        activeAlerts.put(alertId, alert);
        System.out.println("ALERT FIRING -> ID: " + alertId + " | Service: " + serviceId + " | Title: " + title);
        // Start escalation state machine
        executeEscalationLevel(alertId);
        return alertId;
    }

    /**
     * Atomically decides whether an alert with the given key may fire at {@code now} and,
     * if so, records {@code now} as the key's last fire time.
     *
     * <p>A plain get-then-put would let two concurrent callers both pass the check, so the
     * timestamp is installed with {@code putIfAbsent} / conditional {@code replace} and the
     * check is retried when another thread changed the entry in between.
     *
     * @return {@code true} if the caller may fire, {@code false} if it must be suppressed
     */
    private boolean claimDedupWindow(String dedupKey, long now) {
        while (true) {
            Long lastTriggered = lastAlertTimeMap.putIfAbsent(dedupKey, now);
            if (lastTriggered == null) {
                return true;
            }
            if (now - lastTriggered < deduplicationWindowMs) {
                return false;
            }
            if (lastAlertTimeMap.replace(dedupKey, lastTriggered, now)) {
                return true;
            }
            // Another thread updated the entry first; re-evaluate against its value.
        }
    }

    /**
     * Pages the alert's current tier and schedules the escalation to the next one.
     * Does nothing (apart from dropping a finished timer entry) when the alert is unknown,
     * no longer firing, has no policy, or has run out of tiers.
     */
    private void executeEscalationLevel(String alertId) {
        Alert alert = activeAlerts.get(alertId);
        if (alert == null || alert.getStatus() != AlertStatus.FIRING) {
            pendingTasks.remove(alertId);
            return;
        }
        EscalationPolicy policy = policies.get(alert.getServiceId());
        if (policy == null || policy.getLevels().isEmpty()) {
            System.out.println("ERROR -> No escalation policy registered for service: " + alert.getServiceId());
            pendingTasks.remove(alertId);
            return;
        }
        List<EscalationLevel> levels = policy.getLevels();
        int currentLvlIdx = alert.getCurrentLevelIndex();
        if (currentLvlIdx >= levels.size()) {
            System.out.println("DEAD LETTER QUEUE -> Alert unacknowledged at all tiers: " + alertId);
            // The chain is over; drop the entry for the timer that just ran.
            pendingTasks.remove(alertId);
            return;
        }
        EscalationLevel level = levels.get(currentLvlIdx);
        String responder = level.getTargetRoleOrTeam();
        // Resolve schedule if target role matches on-call rotations.
        // ConcurrentHashMap rejects null keys, so a null target is used as-is.
        OnCallSchedule schedule = (responder == null) ? null : schedules.get(responder);
        if (schedule != null) {
            responder = schedule.resolveOnCall(System.currentTimeMillis());
        }
        // Notify responder. A failing channel must not stop the escalation: an exception
        // escaping here would skip the scheduling below and end the chain.
        try {
            dispatcher.dispatch(responder, alertId, "Failing Service: " + alert.getServiceId() + " - " + alert.getTitle() + " (Tier: " + (currentLvlIdx + 1) + ")");
        } catch (RuntimeException e) {
            System.out.println("ERROR -> Notification dispatch failed for alert: " + alertId + " (" + e + ")");
        }
        // Schedule next tier escalation task if not acknowledged in time
        ScheduledFuture<?> future = scheduler.schedule(() -> {
            // Only advance the tier while the alert is still firing.
            if (alert.getStatus() == AlertStatus.FIRING) {
                alert.incrementLevelIndex();
            }
            executeEscalationLevel(alertId);
        }, level.getTimeoutSeconds(), TimeUnit.SECONDS);
        pendingTasks.put(alertId, future);
        // An acknowledgment may have landed between the status check at the top and the
        // put above, in which case acknowledgeAlert could not see this timer. Cancel it here.
        if (alert.getStatus() != AlertStatus.FIRING && pendingTasks.remove(alertId, future)) {
            future.cancel(false);
        }
    }

    /**
     * Acknowledges a firing alert and cancels its pending escalation timer.
     *
     * @param alertId   id returned by {@link #triggerAlert}; {@code null} is tolerated
     *                  because {@code triggerAlert} returns {@code null} for suppressed alerts
     * @param responder name of the person acknowledging
     * @return {@code true} if this call moved the alert from FIRING to ACKNOWLEDGED;
     *         {@code false} if the alert is unknown or was not firing
     */
    public boolean acknowledgeAlert(String alertId, String responder) {
        if (alertId == null) {
            return false;
        }
        Alert alert = activeAlerts.get(alertId);
        if (alert == null) {
            return false;
        }
        // Atomically transitions status to ACKNOWLEDGED; only one caller can win.
        if (!alert.compareAndSetStatus(AlertStatus.FIRING, AlertStatus.ACKNOWLEDGED)) {
            return false;
        }
        alert.setAcknowledgedBy(responder);
        alert.setAcknowledgedAt(System.currentTimeMillis());
        // Cancel any pending timed escalation tasks
        ScheduledFuture<?> future = pendingTasks.remove(alertId);
        if (future != null) {
            future.cancel(false);
        }
        System.out.println("ALERT ACKNOWLEDGED -> ID: " + alertId + " | Confirmed by: " + responder);
        return true;
    }

    /** Initiates shutdown of the internal scheduler; does not wait for it to terminate. */
    public void shutdown() {
        scheduler.shutdown();
    }
}
/**
 * Console walkthrough of the pager: one alert acknowledged at tier 1, and a second alert
 * that is left unacknowledged past the tier-1 timeout before being acknowledged.
 */
public class Main {
    /**
     * Runs the simulation.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting between steps
     */
    public static void main(String[] args) throws InterruptedException {
        System.out.println("=== JAVA ON-CALL PAGER SIMULATION ===");
        ConsoleNotificationDispatcher dispatcher = new ConsoleNotificationDispatcher();
        EscalationManager manager = new EscalationManager(dispatcher);
        // The scheduler threads are created by Executors.newScheduledThreadPool, so the
        // manager is shut down in a finally block even if a sleep below is interrupted.
        try {
            List<String> primaryOnCall = Arrays.asList("Alice", "Bob");
            OnCallSchedule primarySchedule = new OnCallSchedule("sched-1", primaryOnCall, 3600000L, System.currentTimeMillis());
            manager.addSchedule("PRIMARY_ONCALL", primarySchedule);
            List<EscalationLevel> levels = Arrays.asList(
                new EscalationLevel(1, "PRIMARY_ONCALL", 2),
                new EscalationLevel(2, "SRE_LEAD", 5)
            );
            EscalationPolicy policy = new EscalationPolicy("policy-1", "service-a", levels);
            manager.addPolicy(policy);

            System.out.println("Triggering alert...");
            String alertId = manager.triggerAlert("service-a", "sig-1", "Disk Space Low");
            // Acknowledge after 1s, inside the 2s tier-1 timeout.
            Thread.sleep(1000);
            if (alertId != null) {
                manager.acknowledgeAlert(alertId, "Alice");
            }

            System.out.println("\nTriggering second alert (should escalate to Tier 2)...");
            String secondAlertId = manager.triggerAlert("service-a", "sig-2", "CPU Exhaustion");
            // Wait 3s, past the 2s tier-1 timeout, before acknowledging.
            Thread.sleep(3000);
            if (secondAlertId != null) {
                manager.acknowledgeAlert(secondAlertId, "Bob");
            }
        } finally {
            manager.shutdown();
        }
        System.out.println("=== END OF JAVA SIMULATION ===");
    }
}