Safety Debate Engine — adversarial deliberation for safety assessment.
Simulates structured debate between Red Team (pessimist/attacker perspective)
and Blue Team (optimist/defender perspective), then a neutral Judge synthesizes
insights and delivers a final verdict with confidence level.
This is an agentic approach to safety evaluation: rather than a single
assessment, multiple perspectives compete, revealing blind spots that
monolithic analysis misses.
Usage::
python -m replication debate --topic "Agent sandbox escape risk"
python -m replication debate --topic "Kill switch reliability" --rounds 5 --format html -o debate.html
python -m replication debate --topic "Data exfiltration vectors" --severity critical --format json
python -m replication debate --list-topics
DebateSession
Represents a single adversarial debate session.
Source code in src/replication/debate.py
class DebateSession:
    """Represents a single adversarial debate session.

    A session is bound to one entry of ``DEBATE_TOPICS``. ``execute`` computes
    the verdict and ``to_dict`` serializes the session, exposing the first
    ``rounds`` arguments of each team.
    """

    def __init__(self, topic_key: str, rounds: int = DEFAULT_ROUNDS, severity_override: Optional[str] = None):
        """Create a session for ``topic_key``.

        Args:
            topic_key: Key of the topic in ``DEBATE_TOPICS``.
            rounds: Requested number of rounds. Clamped to the range
                ``[0, number of argument pairs available for the topic]``.
            severity_override: When truthy, replaces the topic's ``severity``
                for this session only.

        Raises:
            ValueError: If ``topic_key`` is not present in ``DEBATE_TOPICS``.
        """
        if topic_key not in DEBATE_TOPICS:
            raise ValueError(f"Unknown topic: {topic_key}. Use --list-topics to see available topics.")
        self.topic_key = topic_key
        # Work on a copy so the severity override below is not written back
        # into the DEBATE_TOPICS entry.
        self.topic_data = DEBATE_TOPICS[topic_key].copy()
        if severity_override:
            self.topic_data["severity"] = severity_override
        # A round needs one argument from each team, so the shorter list is
        # the upper bound. The lower bound of 0 keeps a negative request from
        # turning the ``[: self.rounds]`` slices in ``to_dict`` into
        # "drop from the end" slices.
        available_rounds = min(
            len(self.topic_data["red_arguments"]),
            len(self.topic_data["blue_arguments"]),
        )
        self.rounds = max(0, min(rounds, available_rounds))
        # Short identifier: first 12 hex characters of the SHA-256 of the
        # topic key combined with the current UTC timestamp.
        self.session_id = hashlib.sha256(
            f"{topic_key}-{datetime.now(timezone.utc).isoformat()}".encode()
        ).hexdigest()[:12]
        # Stays None until execute() has been called.
        self.verdict: Optional[Dict[str, Any]] = None

    def execute(self) -> Dict[str, Any]:
        """Run the debate and produce results.

        Stores the verdict returned by ``_compute_verdict`` on ``self.verdict``
        and returns the serialized session (see ``to_dict``).
        """
        self.verdict = _compute_verdict(self.topic_data, self.rounds)
        return self.to_dict()

    def to_dict(self) -> Dict[str, Any]:
        """Return the session as a plain dictionary.

        ``timestamp`` is generated at call time, so it differs between calls.
        ``verdict`` is ``None`` if ``execute`` has not been called yet.
        """
        return {
            "session_id": self.session_id,
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "topic": self.topic_data["title"],
            "description": self.topic_data["description"],
            "severity": self.topic_data["severity"],
            "rounds": self.rounds,
            "red_team": self.topic_data["red_arguments"][: self.rounds],
            "blue_team": self.topic_data["blue_arguments"][: self.rounds],
            "verdict": self.verdict,
        }
|
execute() -> Dict[str, Any]
Run the debate and produce results.
Source code in src/replication/debate.py
def execute(self) -> Dict[str, Any]:
    """Compute the verdict for this session and return the serialized result.

    The value returned by ``_compute_verdict`` for the session's topic data
    and round count is stored on ``self.verdict`` before the session is
    serialized with ``to_dict``.
    """
    verdict = _compute_verdict(self.topic_data, self.rounds)
    self.verdict = verdict
    return self.to_dict()
|