forked from mmistakes/minimal-mistakes
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
db55d48
commit dddcd03
Showing
6 changed files
with
345 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,339 @@ | ||
|
||
|
||
|
||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
|
||
|
||
|
||
<!-- Required meta tags --> | ||
<meta charset="utf-8" /> | ||
<meta | ||
name="viewport" | ||
content="width=device-width, initial-scale=1, shrink-to-fit=no" | ||
/> | ||
|
||
<link rel="stylesheet" href="static/css/main.css" type="text/css"/> | ||
<link rel="stylesheet" href="static/css/lazy_load.css" /> | ||
<link rel="stylesheet" href="static/css/typeahead.css" /> | ||
<link rel="icon" href="static/images/logo.png"> | ||
|
||
<!-- External Javascript libs --> | ||
<script src="https://cdn.jsdelivr.net/npm/d3@5/dist/d3.min.js"></script> | ||
|
||
<script src="https://cdn.jsdelivr.net/npm/handlebars@4.7.3/dist/handlebars.min.js" integrity="sha256-/PJBs6QWvXijOFIX04kZpLb6ZtSQckdOIavLWKKOgXU=" crossorigin="anonymous"></script> | ||
|
||
<script src="https://cdn.jsdelivr.net/npm/jquery@3.4.1/dist/jquery.min.js" integrity="sha256-CSXorXvZcTkaix6Yvo6HppcZGetbYMGWSFlBw8HfCJo=" crossorigin="anonymous"></script> | ||
<script src="https://kit.fontawesome.com/c59ce62110.js" crossorigin="anonymous"></script> | ||
|
||
<script | ||
src="https://cdn.jsdelivr.net/npm/popper.js@1.16.0/dist/umd/popper.min.js" | ||
integrity="sha384-Q6E9RHvbIyZFJoft+2mJbHaEWldlvI9IOYy5n3zV9zzTtmI3UksdQRVvoxMfooAo" | ||
crossorigin="anonymous" | ||
></script> | ||
|
||
|
||
<script src="https://cdn.jsdelivr.net/npm/bootstrap@4.4.1/dist/js/bootstrap.min.js" integrity="sha256-WqU1JavFxSAMcLP2WIOI+GB2zWmShMI82mTpLDcqFUg=" crossorigin="anonymous"></script> | ||
|
||
<script src="https://cdn.jsdelivr.net/npm/moment@2.24.0/min/moment.min.js" integrity="sha256-4iQZ6BVL4qNKlQ27TExEhBN1HFPvAvAMbFavKKosSWQ=" crossorigin="anonymous"></script> | ||
|
||
<script src="https://cdn.jsdelivr.net/npm/moment-timezone@0.5.28/builds/moment-timezone-with-data.min.js" integrity="sha256-IWYg4uIC8/erItNXYvLtyYHioRi2zT1TFva8qaAU/ww=" crossorigin="anonymous"></script> | ||
|
||
|
||
<!-- Library libs --> | ||
<script src="static/js/typeahead.bundle.js"></script> | ||
|
||
|
||
|
||
<!-- External CSS --> | ||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@4.3.1/dist/css/bootstrap.min.css" integrity="sha256-YLGeXaapI0/5IgZopewRJcFXomhRMlYYjugPLSyNjTY=" crossorigin="anonymous"> | ||
|
||
<!-- External Fonts (no google for china) --> | ||
<link | ||
href="static/css/Lato.css" | ||
rel="stylesheet" | ||
/> | ||
<link href="static/css/Exo.css" rel="stylesheet" /> | ||
<link | ||
href="static/css/Cuprum.css" | ||
rel="stylesheet" | ||
/> | ||
|
||
<title>UCL DARK Lab: Rainbow Teaming: Open-Ended Generation of Diverse Adversarial Prompts</title> | ||
|
||
<meta name="citation_title" content="Rainbow Teaming: Open-Ended Generation of Diverse Adversarial Prompts" /> | ||
|
||
<meta name="citation_author" content="Mikayel Samvelyan" /> | ||
|
||
<meta name="citation_author" content="Sharath Chandra Raparthy" /> | ||
|
||
<meta name="citation_author" content="Andrei Lupu" /> | ||
|
||
<meta name="citation_author" content="Eric Hambro" /> | ||
|
||
<meta name="citation_author" content="Aram H. Markosyan" /> | ||
|
||
<meta name="citation_author" content="Manish Bhatt" /> | ||
|
||
<meta name="citation_author" content="Yuning Mao" /> | ||
|
||
<meta name="citation_author" content="Minqi Jiang" /> | ||
|
||
<meta name="citation_author" content="Jack Parker-Holder" /> | ||
|
||
<meta name="citation_author" content="Jakob Foerster" /> | ||
|
||
<meta name="citation_author" content="Tim Rocktäschel" /> | ||
|
||
<meta name="citation_author" content="Roberta Raileanu" /> | ||
|
||
<meta name="citation_publication_date" content="None" /> | ||
<meta name="citation_conference_title" content="Ucl Deciding, Acting, And Reasoning With Knowledge (Dark) Lab" /> | ||
<meta name="citation_inbook_title" content="None" /> | ||
<meta name="citation_abstract" content="As large language models (LLMs) become increasingly prevalent across many real-world applications, understanding and enhancing their robustness to adversarial attacks is of paramount importance. Existing methods for identifying adversarial prompts tend to focus on specific domains, lack diversity, or require extensive human annotations. To address these limitations, we present Rainbow Teaming, a novel black-box approach for producing a diverse collection of adversarial prompts. Rainbow Teaming casts adversarial prompt generation as a quality-diversity problem, and uses open-ended search to generate prompts that are both effective and diverse. Focusing on the safety domain, we use Rainbow Teaming to target various state-of-the-art LLMs, including the Llama 2 and Llama 3 models. Our approach reveals hundreds of effective adversarial prompts, with an attack success rate exceeding 90% across all tested models. Furthermore, we demonstrate that prompts generated by Rainbow Teaming are highly transferable and that fine-tuning models with synthetic data generated by our method significantly enhances their safety without sacrificing general performance or helpfulness. We additionally explore the versatility of Rainbow Teaming by applying it to question answering and cybersecurity, showcasing its potential to drive robust open-ended self-improvement in a wide range of applications." /> | ||
|
||
<meta name="citation_keywords" content="open-endedness" /> | ||
|
||
<meta name="citation_keywords" content="large language models" /> | ||
|
||
<meta name="citation_keywords" content="safety" /> | ||
|
||
<meta name="citation_keywords" content="diversity" /> | ||
|
||
<meta name="citation_pdf_url" content="https://arxiv.org/abs/2402.16822" /> | ||
|
||
|
||
</head> | ||
|
||
<body> | ||
<!-- NAV --> | ||
|
||
<nav | ||
class="navbar sticky-top navbar-expand-lg navbar-light bg-light mr-auto" | ||
id="main-nav" | ||
> | ||
<div class="container"> | ||
<!-- | ||
<a class="navbar-brand" href="index.html"> | ||
<img | ||
class="logo" src="static/images/logo.png" | ||
height="auto" | ||
width="130px" | ||
/> | ||
</a> | ||
--> | ||
|
||
<button | ||
class="navbar-toggler" | ||
type="button" | ||
data-toggle="collapse" | ||
data-target="#navbarNav" | ||
aria-controls="navbarNav" | ||
aria-expanded="false" | ||
aria-label="Toggle navigation" | ||
> | ||
<span class="navbar-toggler-icon"></span> | ||
</button> | ||
<div | ||
class="collapse navbar-collapse text-right flex-grow-1" | ||
id="navbarNav" | ||
> | ||
<ul class="navbar-nav ml-auto"> | ||
|
||
<li class="nav-item "> | ||
<a class="nav-link" href="index.html">Home</a> | ||
</li> | ||
|
||
<li class="nav-item "> | ||
<a class="nav-link" href="papers.html">Publications</a> | ||
</li> | ||
|
||
<li class="nav-item "> | ||
<a class="nav-link" href="speakers.html">Speakers</a> | ||
</li> | ||
|
||
<li class="nav-item "> | ||
<a class="nav-link" href="https://blog.ucldark.com/">Blog</a> | ||
</li> | ||
|
||
</ul> | ||
</div> | ||
</div> | ||
</nav> | ||
|
||
|
||
|
||
<!-- User Overrides --> | ||
|
||
|
||
<div class="container"> | ||
<!-- Tabs --> | ||
<div class="tabs"> | ||
|
||
</div> | ||
<!-- Content --> | ||
<div class="content"> | ||
|
||
|
||
<!-- Title --> | ||
<div class="pp-card m-3" style=""> | ||
<div class="card-header"> | ||
<h2 class="card-title main-title text-center" style=""> | ||
Rainbow Teaming: Open-Ended Generation of Diverse Adversarial Prompts | ||
</h2> | ||
<h3 class="card-subtitle mb-2 text-muted text-center"> | ||
|
||
<a href="papers.html?filter=authors&search=Mikayel Samvelyan" class="text-muted" | ||
>Mikayel Samvelyan</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Sharath Chandra Raparthy" class="text-muted" | ||
>Sharath Chandra Raparthy</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Andrei Lupu" class="text-muted" | ||
>Andrei Lupu</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Eric Hambro" class="text-muted" | ||
>Eric Hambro</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Aram H. Markosyan" class="text-muted" | ||
>Aram H. Markosyan</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Manish Bhatt" class="text-muted" | ||
>Manish Bhatt</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Yuning Mao" class="text-muted" | ||
>Yuning Mao</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Minqi Jiang" class="text-muted" | ||
>Minqi Jiang</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Jack Parker-Holder" class="text-muted" | ||
>Jack Parker-Holder</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Jakob Foerster" class="text-muted" | ||
>Jakob Foerster</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Tim Rocktäschel" class="text-muted" | ||
>Tim Rocktäschel</a | ||
>, | ||
|
||
<a href="papers.html?filter=authors&search=Roberta Raileanu" class="text-muted" | ||
>Roberta Raileanu</a | ||
> | ||
|
||
</h3> | ||
<p class="card-text text-center"> | ||
<span class="">Keywords:</span> | ||
|
||
<a | ||
href="papers.html?filter=keywords&search=open-endednes" | ||
class="text-secondary text-decoration-none" | ||
>open-endedness</a | ||
>, | ||
|
||
<a | ||
href="papers.html?filter=keywords&search=large language models" | ||
class="text-secondary text-decoration-none" | ||
>large language models</a | ||
>, | ||
|
||
<a | ||
href="papers.html?filter=keywords&search=safety" | ||
class="text-secondary text-decoration-none" | ||
>safety</a | ||
>, | ||
|
||
<a | ||
href="papers.html?filter=keywords&search=diversity" | ||
class="text-secondary text-decoration-none" | ||
>diversity</a | ||
> | ||
|
||
</p> | ||
<div class="text-center p-3"> | ||
|
||
<a class="card-link" data-toggle="collapse" role="button" href="#details"> | ||
Abstract | ||
</a> | ||
|
||
<a class="card-link" target="_blank" href="https://arxiv.org/abs/2402.16822"> | ||
Paper | ||
</a> | ||
|
||
</div> | ||
</div> | ||
</div> | ||
|
||
<div id="details" class="pp-card m-3"> | ||
<div class="card-body"> | ||
<div class="card-text"> | ||
<div id="abstractExample"> | ||
<span class="font-weight-bold">Abstract:</span> | ||
As large language models (LLMs) become increasingly prevalent across many real-world applications, understanding and enhancing their robustness to adversarial attacks is of paramount importance. Existing methods for identifying adversarial prompts tend to focus on specific domains, lack diversity, or require extensive human annotations. To address these limitations, we present Rainbow Teaming, a novel black-box approach for producing a diverse collection of adversarial prompts. Rainbow Teaming casts adversarial prompt generation as a quality-diversity problem, and uses open-ended search to generate prompts that are both effective and diverse. Focusing on the safety domain, we use Rainbow Teaming to target various state-of-the-art LLMs, including the Llama 2 and Llama 3 models. Our approach reveals hundreds of effective adversarial prompts, with an attack success rate exceeding 90% across all tested models. Furthermore, we demonstrate that prompts generated by Rainbow Teaming are highly transferable and that fine-tuning models with synthetic data generated by our method significantly enhances their safety without sacrificing general performance or helpfulness. We additionally explore the versatility of Rainbow Teaming by applying it to question answering and cybersecurity, showcasing its potential to drive robust open-ended self-improvement in a wide range of applications. | ||
</div> | ||
</div> | ||
<p></p> | ||
</div> | ||
</div> | ||
|
||
|
||
|
||
</div> | ||
</div> | ||
|
||
|
||
|
||
<!-- Google Analytics --> | ||
<script | ||
async | ||
src="https://www.googletagmanager.com/gtag/js?id=UA-" | ||
></script> | ||
<script> | ||
// Reuse window.dataLayer if it already exists; otherwise start with an empty array.
window.dataLayer = window.dataLayer || []; | ||
// gtag(): appends its entire `arguments` object to dataLayer as a single entry.
function gtag() { | ||
dataLayer.push(arguments); | ||
} | ||
// Queue a "js" entry carrying the current Date.
gtag("js", new Date()); | ||
// NOTE(review): "UA-" has nothing after the prefix (the loader URL's id= parameter
// is the same) — looks like an unfilled tracking ID; confirm the site config.
gtag("config", "UA-"); | ||
</script> | ||
|
||
<!-- Footer --> | ||
<footer class="footer bg-light p-4"> | ||
<div class="container"> | ||
<p class="float-right"><a href="#">Back to Top</a></p> | ||
<p class="text-center">© 2020 UCL DARK Lab</p> | ||
</div> | ||
</footer> | ||
|
||
<!-- Code for hash tags --> | ||
<script type="text/javascript"> | ||
// Runs once the DOM is ready. Two jobs:
//   1. If the URL has a fragment, call .tab("show") on any <a> whose href equals it.
//   2. When a tab link fires "shown.bs.tab", write that link's "#..." href into the URL.
$(document).ready(function () { | ||
// location.hash includes the leading "#", so it is compared to href verbatim.
// NOTE(review): the hash is concatenated into the selector unescaped — a fragment
// containing a double quote would presumably make the selector invalid; confirm.
if (location.hash !== "") { | ||
$('a[href="' + location.hash + '"]').tab("show"); | ||
} | ||
|
||
// Bound only to anchors matching a[data-toggle='tab'] at the time this runs.
$("a[data-toggle='tab']").on("shown.bs.tab", function (e) { | ||
var hash = $(e.target).attr("href"); | ||
// Only in-page targets ("#...") are written to the URL; other hrefs are ignored.
if (hash.substr(0, 1) == "#") { | ||
// Save the scroll offset, change the fragment, then restore the offset —
// presumably so the fragment change does not scroll the page.
var position = $(window).scrollTop(); | ||
location.replace("#" + hash.substr(1)); | ||
$(window).scrollTop(position); | ||
} | ||
}); | ||
}); | ||
</script> | ||
<script src="static/js/lazy_load.js"></script> | ||
|
||
</body> | ||
</html> |
Oops, something went wrong.