Skip to content

AI Replication Sandbox

Red Team

sauravbhattacharya001/ai

Red Team¶

`red_team` ¶

Red Team Planner — structured AI safety red team exercise generator.

Generates comprehensive red team plans with:

- Attack phases (recon, exploitation, persistence, exfiltration)
- Rules of engagement with safety boundaries
- Scoring rubrics for defender/attacker effectiveness
- Exercise templates (5 built-in scenarios)
- Timeline generation with milestones
- HTML and JSON export

Usage::

python -m replication red-team                       # generate default exercise
python -m replication red-team --scenario jailbreak  # specific scenario
python -m replication red-team --list                # list scenarios
python -m replication red-team --duration 4h         # set duration
python -m replication red-team --teams 3             # number of red teams
python -m replication red-team --output plan.html    # export HTML
python -m replication red-team --format json         # JSON output
python -m replication red-team --difficulty hard      # easy/medium/hard/extreme

`Objective` `dataclass` ¶

A specific red team objective.

Source code in src/replication/red_team.py

@dataclass
class Objective:
    """A single scored goal that a red team attempts during an exercise.

    The text and HTML reports list every objective with its identifier,
    name, difficulty, point value, success criteria and optional hints.
    """
    # Short identifier; printed in brackets in the text report and in a
    # <code> cell in the HTML report.
    id: str
    # Human-readable title of the objective.
    name: str
    # Longer explanation; the text report wraps it under the title line.
    description: str
    # Attack phase this objective belongs to. The module overview lists
    # recon, exploitation, persistence and exfiltration — presumably those
    # are the expected values (TODO confirm against the scenario data).
    phase: str
    # Point value, rendered as "<points> pts" in the text report.
    # generate_plan passes objectives through _scale_points with the chosen
    # difficulty before they reach a plan.
    points: int
    # Difficulty label. format_html has colours for "easy", "medium",
    # "hard" and "extreme" and falls back to grey for anything else.
    difficulty: str
    # Conditions listed under "Success criteria" in the reports.
    success_criteria: List[str]
    # Optional hints; the reports only emit a hints line when non-empty.
    hints: List[str] = field(default_factory=list)

`RulesOfEngagement` `dataclass` ¶

Safety boundaries for the exercise.

Source code in src/replication/red_team.py

@dataclass
class RulesOfEngagement:
    """Safety boundaries and operating rules that apply to an exercise.

    Rendered as the "Rules of Engagement" section of the text and HTML
    reports.
    """
    # Techniques teams may use; each is listed with a "✓" marker.
    allowed_techniques: List[str]
    # Techniques that are off-limits; each is listed with a "✗" marker.
    forbidden_techniques: List[str]
    # Safety stop entries; each is listed with a "⚠" marker.
    safety_stops: List[str]
    # Free-text escalation procedure, shown after the "Escalation:" label.
    escalation_protocol: str
    # Free-text data handling policy, shown after "Data handling:".
    data_handling: str
    # Channel used for exercise communication, shown after "Comms:".
    communication_channel: str

`TeamConfig` `dataclass` ¶

Configuration for a red team.

Source code in src/replication/red_team.py

@dataclass
class TeamConfig:
    """Configuration for one red team taking part in an exercise.

    Rendered in the "Teams" section of the text and HTML reports.
    """
    # Numeric team identifier; the text report shows it as "#<team_id>".
    team_id: int
    # Display name of the team.
    name: str
    # Area the team concentrates on, shown after "Focus:" in the text report.
    focus_area: str
    # Skill level label, shown after "Level:".
    skill_level: str
    # Strings joined with ", " in the reports; presumably Objective ids
    # (TODO confirm against _assign_teams).
    assigned_objectives: List[str]

`Milestone` `dataclass` ¶

Timeline milestone.

Source code in src/replication/red_team.py

@dataclass
class Milestone:
    """One entry on the exercise timeline."""
    # Rendered through _format_time in the reports; presumably minutes from
    # the start of the exercise (TODO confirm against _generate_timeline).
    time_offset_min: int
    # Short milestone title.
    name: str
    # Extra detail; the text report prints it only when non-empty.
    description: str
    # Checkpoint milestones are flagged with "◆" in both reports and get a
    # bold, highlighted row in the HTML report.
    checkpoint: bool = False

`ScoringRubric` `dataclass` ¶

Scoring criteria.

Source code in src/replication/red_team.py

@dataclass
class ScoringRubric:
    """Scoring criteria for one category of the exercise."""
    # Category heading shown in the "Scoring" section of the reports.
    category: str
    # Maximum points for this category; the reports sum this value over all
    # rubrics to display the total possible score.
    max_points: int
    # (criterion name, points) pairs, listed one per line in the reports.
    criteria: List[Tuple[str, int]]

`RedTeamPlan` `dataclass` ¶

Complete red team exercise plan.

Source code in src/replication/red_team.py

@dataclass
class RedTeamPlan:
    """Complete red team exercise plan.

    Instances are built by generate_plan and consumed by format_text,
    format_json and format_html.
    """
    # Plan identifier; generate_plan fills it from _generate_id().
    id: str
    # Scenario key; generate_plan looks it up in SCENARIOS.
    scenario: str
    # Scenario title, used as the report heading.
    title: str
    # Scenario description, shown at the top of the reports.
    description: str
    # Difficulty label; generate_plan stores Difficulty.value here and the
    # reports upper-case it for display.
    difficulty: str
    # Exercise length; rendered through _format_time in the reports.
    duration_minutes: int
    # Creation timestamp; generate_plan stores
    # datetime.datetime.now().isoformat() and format_html displays its
    # first 19 characters.
    created: str
    # Objectives the teams attempt during the exercise.
    objectives: List[Objective]
    # Rules of engagement that apply to the whole exercise.
    rules: RulesOfEngagement
    # Participating red teams.
    teams: List[TeamConfig]
    # Milestones, rendered in list order in the "Timeline" section.
    timeline: List[Milestone]
    # Scoring rubrics, one per category.
    scoring: List[ScoringRubric]
    # Questions listed (numbered from 1) in the "Debrief Questions" section.
    debrief_questions: List[str]

`generate_plan(scenario: str = 'jailbreak', difficulty: Difficulty = Difficulty.MEDIUM, duration_min: int = 120, num_teams: int = 2) -> RedTeamPlan` ¶

Generate a complete red team exercise plan.

Source code in src/replication/red_team.py

def generate_plan(
    scenario: str = "jailbreak",
    difficulty: Difficulty = Difficulty.MEDIUM,
    duration_min: int = 120,
    num_teams: int = 2,
) -> RedTeamPlan:
    """Generate a complete red team exercise plan.

    Args:
        scenario: Key of a built-in template in ``SCENARIOS``.
        difficulty: Difficulty level; passed to ``_scale_points`` together
            with the objectives and stored on the plan as
            ``difficulty.value``.
        duration_min: Exercise length in minutes; stored on the plan and
            used to build the timeline.
        num_teams: Number of red teams to create.

    Returns:
        A new ``RedTeamPlan``. Its objectives, rules and debrief questions
        are copies, so mutating a plan never alters the ``SCENARIOS``
        template or any other plan generated from it.

    Raises:
        ValueError: If ``scenario`` is not a key of ``SCENARIOS``.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    if scenario not in SCENARIOS:
        raise ValueError(f"Unknown scenario: {scenario}. Available: {', '.join(SCENARIOS)}")

    sc = SCENARIOS[scenario]
    # Rebuild each Objective from its dict form so the template objects are
    # not the ones handed to _scale_points and stored on the plan.
    objectives = [Objective(**asdict(o)) if isinstance(o, Objective) else o for o in sc["objectives"]]
    objectives = _scale_points(objectives, difficulty)

    return RedTeamPlan(
        id=_generate_id(),
        scenario=scenario,
        title=sc["title"],
        description=sc["description"],
        difficulty=difficulty.value,
        duration_minutes=duration_min,
        created=datetime.datetime.now().isoformat(),
        objectives=objectives,
        # Copy the rules and debrief list as well: previously the plan shared
        # these objects with the module-level template, so editing one plan
        # silently changed every plan generated afterwards.
        rules=copy.deepcopy(sc["rules"]),
        teams=_assign_teams(num_teams, objectives, scenario),
        timeline=_generate_timeline(duration_min, objectives),
        scoring=_generate_scoring(objectives),
        debrief_questions=list(sc["debrief"]),
    )

`format_text(plan: RedTeamPlan) -> str` ¶

Format plan as readable text.

Source code in src/replication/red_team.py

def format_text(plan: RedTeamPlan) -> str:
    """Render *plan* as a plain-text report.

    The report is laid out against a 72-character rule and contains, in
    order: a banner, the description, objectives, rules of engagement,
    teams, timeline, scoring (with the summed maximum) and the numbered
    debrief questions.

    Returns:
        The report as a single newline-joined string.
    """
    width = 72
    heavy_rule = "=" * width
    light_rule = "-" * width
    out = []

    def section(title):
        # Every section starts with its title followed by a dashed rule.
        out.append(title)
        out.append(light_rule)

    # Banner
    out.append(heavy_rule)
    out.append(f"  RED TEAM EXERCISE PLAN: {plan.title.upper()}")
    out.append(f"  ID: {plan.id}  |  Difficulty: {plan.difficulty.upper()}  |  Duration: {_format_time(plan.duration_minutes)}")
    out.append(heavy_rule)
    out.append("")

    section("DESCRIPTION")
    out.extend(f"  {chunk}" for chunk in textwrap.wrap(plan.description, width - 2))
    out.append("")

    section("OBJECTIVES")
    for objective in plan.objectives:
        out.append(f"  [{objective.id}] {objective.name} ({objective.difficulty}) — {objective.points} pts")
        out.extend(f"      {chunk}" for chunk in textwrap.wrap(objective.description, width - 6))
        out.append("      Success criteria:")
        out.extend(f"        • {criterion}" for criterion in objective.success_criteria)
        if objective.hints:
            out.append(f"      Hints: {'; '.join(objective.hints)}")
        out.append("")

    section("RULES OF ENGAGEMENT")
    rules = plan.rules
    out.append("  Allowed techniques:")
    out.extend(f"    ✓ {technique}" for technique in rules.allowed_techniques)
    out.append("  Forbidden techniques:")
    out.extend(f"    ✗ {technique}" for technique in rules.forbidden_techniques)
    out.append("  Safety stops:")
    out.extend(f"    ⚠ {stop}" for stop in rules.safety_stops)
    out.append(f"  Escalation: {rules.escalation_protocol}")
    out.append(f"  Data handling: {rules.data_handling}")
    out.append(f"  Comms: {rules.communication_channel}")
    out.append("")

    section("TEAMS")
    for team in plan.teams:
        out.append(f"  {team.name} (#{team.team_id}) — Focus: {team.focus_area} — Level: {team.skill_level}")
        out.append(f"    Objectives: {', '.join(team.assigned_objectives)}")
    out.append("")

    section("TIMELINE")
    for milestone in plan.timeline:
        # Checkpoints carry a diamond; other rows get matching padding.
        if milestone.checkpoint:
            flag = " ◆"
        else:
            flag = "  "
        out.append(f"  {_format_time(milestone.time_offset_min):>5}{flag} {milestone.name}")
        if milestone.description:
            out.append(f"         {milestone.description}")
    out.append("")

    section("SCORING")
    for rubric in plan.scoring:
        out.append(f"  {rubric.category} (max {rubric.max_points} pts)")
        out.extend(f"    • {label}: {value} pts" for label, value in rubric.criteria)
    grand_total = sum(rubric.max_points for rubric in plan.scoring)
    out.append("  " + "─" * 40)
    out.append(f"  TOTAL POSSIBLE: {grand_total} pts")
    out.append("")

    section("DEBRIEF QUESTIONS")
    out.extend(
        f"  {number}. {question}"
        for number, question in enumerate(plan.debrief_questions, 1)
    )
    out.append("")
    out.append(heavy_rule)
    return "\n".join(out)

`format_json(plan: RedTeamPlan) -> str` ¶

Format plan as JSON.

Source code in src/replication/red_team.py

def format_json(plan: RedTeamPlan) -> str:
    """Format plan as JSON."""
    def _to_dict(obj: Any) -> Any:
        if hasattr(obj, '__dataclass_fields__'):
            return {k: _to_dict(v) for k, v in asdict(obj).items()}
        if isinstance(obj, list):
            return [_to_dict(i) for i in obj]
        return obj
    return json.dumps(_to_dict(plan), indent=2)

`format_html(plan: RedTeamPlan) -> str` ¶

Generate self-contained HTML report.

Source code in src/replication/red_team.py

def format_html(plan: RedTeamPlan) -> str:
    """Render *plan* as a single HTML document with inline CSS.

    Each section (objectives, rules, teams, timeline, scoring, debrief) is
    first assembled as an HTML fragment and then substituted into the page
    template. All text taken from the plan is passed through
    ``html_mod.escape`` before insertion.

    Returns:
        The complete HTML page as a string.
    """
    _e = html_mod.escape

    # Objectives table rows; unknown difficulty labels fall back to grey.
    difficulty_palette = {"easy": "#22c55e", "medium": "#eab308", "hard": "#f97316", "extreme": "#ef4444"}
    objective_rows = []
    for objective in plan.objectives:
        criteria_items = "".join(f"<li>{_e(item)}</li>" for item in objective.success_criteria)
        if objective.hints:
            hints_block = f"<br><em>Hints: {_e('; '.join(objective.hints))}</em>"
        else:
            hints_block = ""
        colour = difficulty_palette.get(objective.difficulty, "#888")
        objective_rows.append(f"""<tr>
            <td><code>{_e(objective.id)}</code></td>
            <td><strong>{_e(objective.name)}</strong><br><small>{_e(objective.description)}</small></td>
            <td><span style="color:{colour};font-weight:bold">{_e(objective.difficulty)}</span></td>
            <td style="text-align:center"><strong>{objective.points}</strong></td>
            <td><ul style="margin:0;padding-left:18px">{criteria_items}</ul>{hints_block}</td>
        </tr>""")
    obj_rows = "".join(objective_rows)

    # Timeline rows; checkpoints are highlighted and marked with a diamond.
    timeline_rows = []
    for milestone in plan.timeline:
        if milestone.checkpoint:
            row_style, glyph = "font-weight:bold;background:#1e293b", "◆"
        else:
            row_style, glyph = "", "○"
        timeline_rows.append(
            f'<tr style="{row_style}"><td>{_format_time(milestone.time_offset_min)}</td><td>{glyph}</td>'
            f'<td>{_e(milestone.name)}</td><td>{_e(milestone.description)}</td></tr>'
        )
    timeline_html = "".join(timeline_rows)

    # One card per team.
    teams_html = "".join(
        f'<div style="background:#1e293b;padding:12px;border-radius:8px;margin:6px 0"><strong>{_e(team.name)}</strong> — {_e(team.focus_area)}<br><small>Level: {_e(team.skill_level)} | Objectives: {_e(", ".join(team.assigned_objectives))}</small></div>'
        for team in plan.teams
    )

    # Rules of engagement lists.
    rules = plan.rules
    allowed_html = "".join(f"<li>✓ {_e(technique)}</li>" for technique in rules.allowed_techniques)
    forbidden_html = "".join(f"<li>✗ {_e(technique)}</li>" for technique in rules.forbidden_techniques)
    stops_html = "".join(f"<li>⚠ {_e(stop)}</li>" for stop in rules.safety_stops)

    # Scoring blocks plus the running maximum across all rubrics.
    scoring_blocks = []
    total = 0
    for rubric in plan.scoring:
        bullet_items = "".join(
            f"<li>{_e(label)}: <strong>{value} pts</strong></li>" for label, value in rubric.criteria
        )
        scoring_blocks.append(
            f'<div style="margin:8px 0"><h4>{_e(rubric.category)} (max {rubric.max_points})</h4><ul>{bullet_items}</ul></div>'
        )
        total += rubric.max_points
    scoring_html = "".join(scoring_blocks)

    debrief_html = "".join(f"<li>{_e(question)}</li>" for question in plan.debrief_questions)

    badge_by_level = {"easy": "🟢", "medium": "🟡", "hard": "🟠", "extreme": "🔴"}
    diff_badge = badge_by_level.get(plan.difficulty, "⚪")

    return f"""<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"><meta name="viewport" content="width=device-width,initial-scale=1">
<title>Red Team Plan: {_e(plan.title)}</title>
<style>
*{{margin:0;padding:0;box-sizing:border-box}}
body{{font-family:system-ui,-apple-system,sans-serif;background:#0f172a;color:#e2e8f0;padding:24px;max-width:1000px;margin:0 auto;line-height:1.6}}
h1{{color:#f8fafc;margin-bottom:4px}} h2{{color:#38bdf8;margin:24px 0 12px;border-bottom:1px solid #334155;padding-bottom:6px}}
h3{{color:#94a3b8;margin:16px 0 8px}} h4{{color:#cbd5e1;margin:0}}
table{{width:100%;border-collapse:collapse;margin:8px 0}} th,td{{padding:8px 12px;text-align:left;border-bottom:1px solid #1e293b}}
th{{background:#1e293b;color:#94a3b8;font-size:0.85em;text-transform:uppercase}}
tr:hover{{background:#1e293b44}}
code{{background:#1e293b;padding:2px 6px;border-radius:4px;font-size:0.9em}}
ul{{margin:4px 0;padding-left:20px}} li{{margin:2px 0}}
.badge{{display:inline-block;padding:4px 12px;border-radius:12px;font-weight:bold;font-size:0.9em}}
.meta{{color:#64748b;font-size:0.9em;margin:4px 0}}
</style></head><body>
<h1>🎯 {_e(plan.title)}</h1>
<p class="meta">Plan ID: <code>{_e(plan.id)}</code> | {diff_badge} {_e(plan.difficulty.upper())} | Duration: {_format_time(plan.duration_minutes)} | Generated: {_e(plan.created[:19])}</p>
<p style="margin:12px 0">{_e(plan.description)}</p>

<h2>📋 Objectives</h2>
<table><thead><tr><th>ID</th><th>Objective</th><th>Difficulty</th><th>Points</th><th>Success Criteria</th></tr></thead><tbody>{obj_rows}</tbody></table>

<h2>⚖️ Rules of Engagement</h2>
<h3>Allowed</h3><ul>{allowed_html}</ul>
<h3>Forbidden</h3><ul style="color:#f87171">{forbidden_html}</ul>
<h3>Safety Stops</h3><ul style="color:#fbbf24">{stops_html}</ul>
<p><strong>Escalation:</strong> {_e(plan.rules.escalation_protocol)}</p>
<p><strong>Data handling:</strong> {_e(plan.rules.data_handling)}</p>
<p><strong>Comms:</strong> {_e(plan.rules.communication_channel)}</p>

<h2>👥 Teams</h2>{teams_html}

<h2>⏱️ Timeline</h2>
<table><thead><tr><th>Time</th><th></th><th>Milestone</th><th>Description</th></tr></thead><tbody>{timeline_html}</tbody></table>

<h2>🏆 Scoring</h2>{scoring_html}
<p style="font-size:1.2em;margin-top:12px"><strong>Total Possible: {total} pts</strong></p>

<h2>💬 Debrief Questions</h2><ol>{debrief_html}</ol>

<footer style="margin-top:32px;padding-top:12px;border-top:1px solid #334155;color:#475569;font-size:0.85em;text-align:center">
Generated by AI Replication Sandbox — Red Team Planner
</footer></body></html>"""