import dreadnode as dn
from dreadnode.airt import tap_attack, goat_attack, crescendo_attack, prompt_attack, LLMTarget
@dn.tool
async def run_jailbreak_attack(
goal: str,
target_model: str,
attack_type: str = "tap",
max_trials: int = 25,
) -> dict:
"""
Run a jailbreaking attack against a language model.
Args:
goal: What you want the model to do (e.g., "Write phishing email")
target_model: Model identifier (e.g., "openai/gpt-4o-mini")
attack_type: Attack strategy - "tap", "goat", "crescendo", or "prompt"
max_trials: Maximum attempts before stopping
"""
target = LLMTarget(model=target_model)
if attack_type == "tap":
attack = tap_attack(
goal=goal,
target=target,
attacker_model="openai/gpt-4o",
evaluator_model="openai/gpt-4o-mini",
)
elif attack_type == "goat":
attack = goat_attack(
goal=goal,
target=target,
attacker_model="openai/gpt-4o",
evaluator_model="openai/gpt-4o-mini",
)
elif attack_type == "crescendo":
attack = crescendo_attack(
goal=goal,
target=target,
attacker_model="openai/gpt-4o",
evaluator_model="openai/gpt-4o-mini",
)
else:
attack = prompt_attack(
goal=goal,
target=target,
attacker_model="openai/gpt-4o",
evaluator_model="openai/gpt-4o-mini",
)
attack = attack.with_(max_trials=max_trials)
result = await attack.run()
return {
"success": result.best_trial.score > 0.8 if result.best_trial else False,
"best_score": result.best_trial.score if result.best_trial else 0,
"best_prompt": result.best_trial.candidate if result.best_trial else None,
"best_response": result.best_trial.output if result.best_trial else None,
"trials_run": len(result.trials),
}