Red Team Planner — structured AI safety red team exercise generator.
Generates comprehensive red team plans with:
- Attack phases (recon, exploitation, persistence, exfiltration)
- Rules of engagement with safety boundaries
- Scoring rubrics for defender/attacker effectiveness
- Exercise templates (5 built-in scenarios)
- Timeline generation with milestones
- HTML and JSON export
Usage::
python -m replication red-team # generate default exercise
python -m replication red-team --scenario jailbreak # specific scenario
python -m replication red-team --list # list scenarios
python -m replication red-team --duration 4h # set duration
python -m replication red-team --teams 3 # number of red teams
python -m replication red-team --output plan.html # export HTML
python -m replication red-team --format json # JSON output
python -m replication red-team --difficulty hard # easy/medium/hard/extreme
Objective
dataclass
A specific red team objective.
Source code in src/replication/red_team.py
@dataclass
class Objective:
    """One scoreable goal that a red team attempts during an exercise."""

    # Short identifier; the text report prints it in square brackets.
    id: str
    name: str
    description: str
    # Attack phase this objective belongs to.
    phase: str
    # Point value listed next to the objective in reports.
    points: int
    # Difficulty label; the HTML report colours easy/medium/hard/extreme.
    difficulty: str
    # Conditions rendered as a bullet list under the objective.
    success_criteria: List[str]
    # Optional hints; reports omit the hints line when this is empty.
    hints: List[str] = field(default_factory=list)
|
RulesOfEngagement
dataclass
Safety boundaries for the exercise.
Source code in src/replication/red_team.py
@dataclass
class RulesOfEngagement:
    """Ground rules and safety boundaries that apply to an exercise."""

    # Techniques teams may use (reports mark each entry with a check mark).
    allowed_techniques: List[str]
    # Techniques that are off-limits (reports mark each entry with a cross).
    forbidden_techniques: List[str]
    # Safety-stop entries (reports mark each entry with a warning sign).
    safety_stops: List[str]
    # Free-text fields printed verbatim in the rules section of reports.
    escalation_protocol: str
    data_handling: str
    communication_channel: str
|
TeamConfig
dataclass
Configuration for a red team.
Source code in src/replication/red_team.py
@dataclass
class TeamConfig:
    """Setup of a single red team taking part in the exercise."""

    # Numeric team identifier; the text report prints it as "#<team_id>".
    team_id: int
    name: str
    focus_area: str
    skill_level: str
    # Strings joined with ", " in reports -- presumably objective ids.
    assigned_objectives: List[str]
|
Milestone
dataclass
Timeline milestone.
Source code in src/replication/red_team.py
@dataclass
class Milestone:
    """A single entry on the exercise timeline."""

    # Offset in minutes (per the field name) -- presumably from exercise start.
    time_offset_min: int
    name: str
    description: str
    # Checkpoint milestones get a diamond marker in the rendered timeline.
    checkpoint: bool = False
|
ScoringRubric
dataclass
Scoring criteria.
Source code in src/replication/red_team.py
@dataclass
class ScoringRubric:
    """One scoring category together with its point breakdown."""

    category: str
    # Maximum for this category; reports sum it into the total possible score.
    max_points: int
    # (label, points) pairs, listed one per line under the category.
    criteria: List[Tuple[str, int]]
|
RedTeamPlan
dataclass
Complete red team exercise plan.
Source code in src/replication/red_team.py
@dataclass
class RedTeamPlan:
    """A fully assembled red team exercise plan, ready to be rendered."""

    # --- identification and metadata ---
    id: str
    # Key of the scenario template the plan was built from.
    scenario: str
    title: str
    description: str
    # Difficulty as a plain string (generate_plan stores the enum's value).
    difficulty: str
    duration_minutes: int
    # Creation timestamp as a string (generate_plan stores an ISO-format value).
    created: str

    # --- exercise content ---
    objectives: List[Objective]
    rules: RulesOfEngagement
    teams: List[TeamConfig]
    timeline: List[Milestone]
    scoring: List[ScoringRubric]
    debrief_questions: List[str]
|
generate_plan(scenario: str = 'jailbreak', difficulty: Difficulty = Difficulty.MEDIUM, duration_min: int = 120, num_teams: int = 2) -> RedTeamPlan
Generate a complete red team exercise plan.
Source code in src/replication/red_team.py
def generate_plan(
    scenario: str = "jailbreak",
    difficulty: Difficulty = Difficulty.MEDIUM,
    duration_min: int = 120,
    num_teams: int = 2,
) -> RedTeamPlan:
    """Generate a complete red team exercise plan.

    Args:
        scenario: Key of a built-in scenario in ``SCENARIOS``.
        difficulty: Difficulty level; passed to ``_scale_points`` and stored
            on the plan as ``difficulty.value``.
        duration_min: Exercise length in minutes, used for the timeline.
        num_teams: Number of red teams to create.

    Returns:
        A ``RedTeamPlan`` whose objectives, rules and debrief questions are
        independent copies of the scenario template, so editing a plan does
        not leak into ``SCENARIOS`` or into plans generated later.

    Raises:
        ValueError: If ``scenario`` is not a key of ``SCENARIOS``.
    """
    if scenario not in SCENARIOS:
        raise ValueError(f"Unknown scenario: {scenario}. Available: {', '.join(SCENARIOS)}")
    sc = SCENARIOS[scenario]

    # Copy the template objectives before they are handed to _scale_points,
    # so the shared scenario definition is left untouched.
    objectives = [Objective(**asdict(o)) if isinstance(o, Objective) else o for o in sc["objectives"]]
    objectives = _scale_points(objectives, difficulty)

    # Same isolation for the rules: asdict() deep-copies the nested lists, so
    # the rebuilt instance shares no mutable state with the template.
    rules = sc["rules"]
    if isinstance(rules, RulesOfEngagement):
        rules = RulesOfEngagement(**asdict(rules))

    return RedTeamPlan(
        id=_generate_id(),
        scenario=scenario,
        title=sc["title"],
        description=sc["description"],
        difficulty=difficulty.value,
        duration_minutes=duration_min,
        created=datetime.datetime.now().isoformat(),
        objectives=objectives,
        rules=rules,
        teams=_assign_teams(num_teams, objectives, scenario),
        timeline=_generate_timeline(duration_min, objectives),
        scoring=_generate_scoring(objectives),
        # Fresh list so appending to one plan's questions cannot alter the template.
        debrief_questions=list(sc["debrief"]),
    )
|
format_text(plan: RedTeamPlan) -> str
Format plan as readable text.
Source code in src/replication/red_team.py
def format_text(plan: RedTeamPlan) -> str:
    """Render the plan as a plain-text report.

    The report has a banner followed by these sections, in order:
    description, objectives, rules of engagement, teams, timeline, scoring
    (with a total of all rubric maxima) and debrief questions.

    Returns:
        The report as one string with lines joined by newlines.
    """
    width = 72
    heavy_rule = "=" * width
    light_rule = "-" * width
    rules = plan.rules

    # Banner and description.
    out = [
        heavy_rule,
        f" RED TEAM EXERCISE PLAN: {plan.title.upper()}",
        f" ID: {plan.id} | Difficulty: {plan.difficulty.upper()} | Duration: {_format_time(plan.duration_minutes)}",
        heavy_rule,
        "",
        "DESCRIPTION",
        light_rule,
    ]
    out.extend(f" {chunk}" for chunk in textwrap.wrap(plan.description, width - 2))
    out.append("")

    # Objectives: one group per objective, each followed by a blank line.
    out += ["OBJECTIVES", light_rule]
    for objective in plan.objectives:
        out.append(f" [{objective.id}] {objective.name} ({objective.difficulty}) — {objective.points} pts")
        out.extend(f" {chunk}" for chunk in textwrap.wrap(objective.description, width - 6))
        out.append(" Success criteria:")
        out.extend(f" • {criterion}" for criterion in objective.success_criteria)
        if objective.hints:
            out.append(f" Hints: {'; '.join(objective.hints)}")
        out.append("")

    # Rules of engagement.
    out += ["RULES OF ENGAGEMENT", light_rule, " Allowed techniques:"]
    out.extend(f" ✓ {technique}" for technique in rules.allowed_techniques)
    out.append(" Forbidden techniques:")
    out.extend(f" ✗ {technique}" for technique in rules.forbidden_techniques)
    out.append(" Safety stops:")
    out.extend(f" ⚠ {stop}" for stop in rules.safety_stops)
    out += [
        f" Escalation: {rules.escalation_protocol}",
        f" Data handling: {rules.data_handling}",
        f" Comms: {rules.communication_channel}",
        "",
    ]

    # Teams: two lines per team.
    out += ["TEAMS", light_rule]
    for team in plan.teams:
        out.append(f" {team.name} (#{team.team_id}) — Focus: {team.focus_area} — Level: {team.skill_level}")
        out.append(f" Objectives: {', '.join(team.assigned_objectives)}")
    out.append("")

    # Timeline: checkpoints carry a diamond marker after the time column.
    out += ["TIMELINE", light_rule]
    for milestone in plan.timeline:
        if milestone.checkpoint:
            flag = " ◆"
        else:
            flag = " "
        out.append(f" {_format_time(milestone.time_offset_min):>5}{flag} {milestone.name}")
        if milestone.description:
            out.append(f" {milestone.description}")
    out.append("")

    # Scoring: list every rubric and accumulate the overall maximum.
    out += ["SCORING", light_rule]
    grand_total = 0
    for rubric in plan.scoring:
        out.append(f" {rubric.category} (max {rubric.max_points} pts)")
        out.extend(f" • {label}: {value} pts" for label, value in rubric.criteria)
        grand_total += rubric.max_points
    out += [f" {'─' * 40}", f" TOTAL POSSIBLE: {grand_total} pts", ""]

    # Debrief questions, numbered from 1.
    out += ["DEBRIEF QUESTIONS", light_rule]
    out.extend(f" {number}. {question}" for number, question in enumerate(plan.debrief_questions, 1))
    out += ["", heavy_rule]

    return "\n".join(out)
|
format_json(plan: RedTeamPlan) -> str
Format plan as JSON.
Source code in src/replication/red_team.py
def format_json(plan: RedTeamPlan) -> str:
    """Serialise the plan to a JSON string indented by two spaces."""

    def _plain(value: Any) -> Any:
        # Dataclass instances become dicts keyed by field name.
        if hasattr(value, '__dataclass_fields__'):
            converted = {}
            for key, item in asdict(value).items():
                converted[key] = _plain(item)
            return converted
        # Lists are converted element by element.
        if isinstance(value, list):
            return list(map(_plain, value))
        # Anything else is handed to json.dumps unchanged.
        return value

    payload = _plain(plan)
    return json.dumps(payload, indent=2)
|
format_html(plan: RedTeamPlan) -> str
Generate self-contained HTML report.
Source code in src/replication/red_team.py
def format_html(plan: RedTeamPlan) -> str:
    """Render the plan as a single self-contained HTML page.

    Every piece of plan text is passed through ``html_mod.escape`` before it
    is placed in the markup. Styling is inlined in a ``<style>`` element and
    in ``style`` attributes; no external resources are referenced.

    Returns:
        The complete HTML document as a string.
    """
    esc = html_mod.escape
    rules = plan.rules

    # Objectives table rows; unknown difficulty labels fall back to grey.
    difficulty_palette = {"easy": "#22c55e", "medium": "#eab308", "hard": "#f97316", "extreme": "#ef4444"}
    objective_rows = []
    for objective in plan.objectives:
        criteria_items = "".join(f"<li>{esc(c)}</li>" for c in objective.success_criteria)
        if objective.hints:
            hints_block = f"<br><em>Hints: {esc('; '.join(objective.hints))}</em>"
        else:
            hints_block = ""
        colour = difficulty_palette.get(objective.difficulty, "#888")
        objective_rows.append("\n".join([
            "<tr>",
            f"<td><code>{esc(objective.id)}</code></td>",
            f"<td><strong>{esc(objective.name)}</strong><br><small>{esc(objective.description)}</small></td>",
            f'<td><span style="color:{colour};font-weight:bold">{esc(objective.difficulty)}</span></td>',
            f'<td style="text-align:center"><strong>{objective.points}</strong></td>',
            f'<td><ul style="margin:0;padding-left:18px">{criteria_items}</ul>{hints_block}</td>',
            "</tr>",
        ]))
    objectives_markup = "".join(objective_rows)

    # Timeline rows; checkpoints are bold, shaded and marked with a diamond.
    timeline_rows = []
    for milestone in plan.timeline:
        if milestone.checkpoint:
            row_style, bullet = "font-weight:bold;background:#1e293b", "◆"
        else:
            row_style, bullet = "", "○"
        timeline_rows.append(
            f'<tr style="{row_style}"><td>{_format_time(milestone.time_offset_min)}</td><td>{bullet}</td><td>{esc(milestone.name)}</td><td>{esc(milestone.description)}</td></tr>'
        )
    timeline_markup = "".join(timeline_rows)

    # One card per team.
    teams_markup = "".join(
        f'<div style="background:#1e293b;padding:12px;border-radius:8px;margin:6px 0"><strong>{esc(team.name)}</strong> — {esc(team.focus_area)}<br><small>Level: {esc(team.skill_level)} | Objectives: {esc(", ".join(team.assigned_objectives))}</small></div>'
        for team in plan.teams
    )

    # Rules of engagement lists.
    allowed_markup = "".join(f"<li>✓ {esc(entry)}</li>" for entry in rules.allowed_techniques)
    forbidden_markup = "".join(f"<li>✗ {esc(entry)}</li>" for entry in rules.forbidden_techniques)
    stops_markup = "".join(f"<li>⚠ {esc(entry)}</li>" for entry in rules.safety_stops)

    # Scoring blocks, accumulating the overall maximum as we go.
    scoring_blocks = []
    grand_total = 0
    for rubric in plan.scoring:
        breakdown = "".join(f"<li>{esc(label)}: <strong>{value} pts</strong></li>" for label, value in rubric.criteria)
        scoring_blocks.append(
            f'<div style="margin:8px 0"><h4>{esc(rubric.category)} (max {rubric.max_points})</h4><ul>{breakdown}</ul></div>'
        )
        grand_total += rubric.max_points
    scoring_markup = "".join(scoring_blocks)

    debrief_markup = "".join(f"<li>{esc(question)}</li>" for question in plan.debrief_questions)

    # Coloured dot shown before the difficulty label; white for unknown labels.
    badge_by_difficulty = {"easy": "🟢", "medium": "🟡", "hard": "🟠", "extreme": "🔴"}
    badge = badge_by_difficulty.get(plan.difficulty, "⚪")

    return f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Red Team Plan: {esc(plan.title)}</title>
<style>
*{{margin:0;padding:0;box-sizing:border-box}}
body{{font-family:system-ui,-apple-system,sans-serif;background:#0f172a;color:#e2e8f0;padding:24px;max-width:1000px;margin:0 auto;line-height:1.6}}
h1{{color:#f8fafc;margin-bottom:4px}} h2{{color:#38bdf8;margin:24px 0 12px;border-bottom:1px solid #334155;padding-bottom:6px}}
h3{{color:#94a3b8;margin:16px 0 8px}} h4{{color:#cbd5e1;margin:0}}
table{{width:100%;border-collapse:collapse;margin:8px 0}} th,td{{padding:8px 12px;text-align:left;border-bottom:1px solid #1e293b}}
th{{background:#1e293b;color:#94a3b8;font-size:0.85em;text-transform:uppercase}}
tr:hover{{background:#1e293b44}}
code{{background:#1e293b;padding:2px 6px;border-radius:4px;font-size:0.9em}}
ul{{margin:4px 0;padding-left:20px}} li{{margin:2px 0}}
.badge{{display:inline-block;padding:4px 12px;border-radius:12px;font-weight:bold;font-size:0.9em}}
.meta{{color:#64748b;font-size:0.9em;margin:4px 0}}
</style></head><body>
<h1>🎯 {esc(plan.title)}</h1>
<p class="meta">Plan ID: <code>{esc(plan.id)}</code> | {badge} {esc(plan.difficulty.upper())} | Duration: {_format_time(plan.duration_minutes)} | Generated: {esc(plan.created[:19])}</p>
<p style="margin:12px 0">{esc(plan.description)}</p>
<h2>📋 Objectives</h2>
<table><thead><tr><th>ID</th><th>Objective</th><th>Difficulty</th><th>Points</th><th>Success Criteria</th></tr></thead><tbody>{objectives_markup}</tbody></table>
<h2>⚖️ Rules of Engagement</h2>
<h3>Allowed</h3><ul>{allowed_markup}</ul>
<h3>Forbidden</h3><ul style="color:#f87171">{forbidden_markup}</ul>
<h3>Safety Stops</h3><ul style="color:#fbbf24">{stops_markup}</ul>
<p><strong>Escalation:</strong> {esc(rules.escalation_protocol)}</p>
<p><strong>Data handling:</strong> {esc(rules.data_handling)}</p>
<p><strong>Comms:</strong> {esc(rules.communication_channel)}</p>
<h2>👥 Teams</h2>{teams_markup}
<h2>⏱️ Timeline</h2>
<table><thead><tr><th>Time</th><th></th><th>Milestone</th><th>Description</th></tr></thead><tbody>{timeline_markup}</tbody></table>
<h2>🏆 Scoring</h2>{scoring_markup}
<p style="font-size:1.2em;margin-top:12px"><strong>Total Possible: {grand_total} pts</strong></p>
<h2>💬 Debrief Questions</h2><ol>{debrief_markup}</ol>
<footer style="margin-top:32px;padding-top:12px;border-top:1px solid #334155;color:#475569;font-size:0.85em;text-align:center">
Generated by AI Replication Sandbox — Red Team Planner
</footer></body></html>"""
|