---
---
@inproceedings{bhagoji_enhancing_2018,
abbr = {CISS},
abstract = {We propose the use of data transformations as a defense against evasion attacks on ML classifiers. We present and investigate strategies for incorporating a variety of data transformations including dimensionality reduction via Principal Component Analysis and data "anti-whitening" to enhance the resilience of machine learning, targeting both the classification and the training phase. We empirically evaluate and demonstrate the feasibility of linear transformations of data as a defense mechanism against evasion attacks using multiple real-world datasets. Our key findings are that the defense is (i) effective against the best known evasion attacks from the literature, resulting in a two-fold increase in the resources required by a white-box adversary with knowledge of the defense for a successful attack, (ii) applicable across a range of ML classifiers, including Support Vector Machines and Deep Neural Networks, and (iii) generalizable to multiple application domains, including image classification and human activity classification.},
author = {Bhagoji, Arjun Nitin and Cullina, Daniel and Sitawarin, Chawin and Mittal, Prateek},
bibtex_show = {false},
booktitle = {52nd Annual Conference on Information Sciences and Systems ({{CISS}})},
code = {https://github.com/inspire-group/ml_defense},
copyright = {All rights reserved},
doi = {10.1109/CISS.2018.8362326},
pages = {1--5},
pdf = {https://arxiv.org/abs/1704.02654},
title = {Enhancing Robustness of Machine Learning Systems via Data Transformations},
year = {2018}
}
@inproceedings{lin_inversedesigned_2016,
abbr = {CLEO},
abstract = {\textcopyright{} 2016 OSA. We describe a large-scale computational approach based on topology optimization that enables automatic discovery of novel nonlinear photonic structures. As examples, we design complex cavity and fiber geometries that can achieve high-efficiency nonlinear frequency conversion.},
author = {Lin, Zin and Sitawarin, Chawin and Loncar, Marko and Rodriguez, Alejandro W.},
bibtex_show = {false},
booktitle = {2016 {{Conference}} on {{Lasers}} and {{Electro}}-{{Optics}}, {{CLEO}} 2016},
copyright = {Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC-BY-NC-ND)},
isbn = {978-1-943580-11-8},
pdf = {http://ieeexplore.ieee.org/document/7788596/},
title = {Inverse-Designed Nonlinear Nanophotonic Structures: Enhanced Frequency Conversion at the Nano Scale},
year = {2016}
}
@article{martinez_grand_2017,
abbr = {Preprint},
abstract = {As an initial assessment, over 480,000 labeled virtual images of normal highway driving were readily generated in Grand Theft Auto V's virtual environment. Using these images, a CNN was trained to detect following distance to cars/objects ahead, lane markings, and driving angle (angular heading relative to lane centerline): all variables necessary for basic autonomous driving. Encouraging results were obtained when tested on over 50,000 labeled virtual images from substantially different GTA-V driving environments. This initial assessment begins to define both the range and scope of the labeled images needed for training as well as the range and scope of labeled images needed for testing the definition of boundaries and limitations of trained networks. It is the efficacy and flexibility of a "GTA-V"-like virtual environment that is expected to provide an efficient well-defined foundation for the training and testing of Convolutional Neural Networks for safe driving. Additionally, described is the Princeton Virtual Environment (PVE) for the training, testing and enhancement of safe driving AI, which is being developed using the video-game engine Unity. PVE is being developed to recreate rare but critical corner cases that can be used in re-training and enhancing machine learning models and understanding the limitations of current self driving models. The Florida Tesla crash is being used as an initial reference.},
archiveprefix = {arXiv},
author = {Martinez, Mark Anthony and Sitawarin, Chawin and Finch, Kevin and Meincke, Lennart and Yablonski, Alexander and Kornhauser, Alain},
bibtex_show = {true},
copyright = {Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC-BY-NC-ND)},
eprint = {1712.01397},
eprinttype = {arxiv},
journal = {arXiv:1712.01397 [cs]},
month = dec,
pdf = {https://arxiv.org/abs/1712.01397},
primaryclass = {cs},
title = {Beyond Grand Theft Auto {V} for Training, Testing and Enhancing Deep Learning in Self Driving Cars},
year = {2017}
}
@inproceedings{sehwag_analyzing_2019,
abbr = {AISec},
abstract = {When deploying machine learning models in real-world applications, an open-world learning framework is needed to deal with both normal in-distribution inputs and undesired out-of-distribution (OOD) inputs. Open-world learning frameworks include OOD detectors that aim to discard input examples which are not from the same distribution as the training data of machine learning classifiers. However, our understanding of current OOD detectors is limited to the setting of benign OOD data, and an open question is whether they are robust in the presence of adversaries. In this paper, we present the first analysis of the robustness of open-world learning frameworks in the presence of adversaries by introducing and designing OOD adversarial examples. Our experimental results show that current OOD detectors can be easily evaded by slightly perturbing benign OOD inputs, revealing a severe limitation of current open-world learning frameworks. Furthermore, we find that OOD adversarial examples also pose a strong threat to adversarial training based defense methods in spite of their effectiveness against in-distribution adversarial attacks. To counteract these threats and ensure the trustworthy detection of OOD inputs, we outline a preliminary design for a robust open-world machine learning framework.},
address = {{New York, NY, USA}},
author = {Sehwag, Vikash and Bhagoji, Arjun Nitin and Song, Liwei and Sitawarin, Chawin and Cullina, Daniel and Chiang, Mung and Mittal, Prateek},
bibtex_show = {false},
booktitle = {Proceedings of the 12th {{ACM}} Workshop on Artificial Intelligence and Security},
code = {https://github.com/inspire-group/OOD-Attacks},
copyright = {All rights reserved},
doi = {10.1145/3338501.3357372},
isbn = {978-1-4503-6833-9},
keywords = {adversarial example,deep learning,open world recognition},
pages = {105--116},
pdf = {https://dl.acm.org/doi/pdf/10.1145/3338501.3357372},
publisher = {{Association for Computing Machinery}},
series = {{{AISec}}'19},
title = {Analyzing the Robustness of Open-World Machine Learning},
year = {2019},
preview = {ood_thumbnail.png}
}
@inproceedings{sehwag_not_2018,
abbr = {CCS},
abstract = {Deep neural networks (DNNs) have enabled success in learning tasks such as image classification, semantic image segmentation and steering angle prediction which can be key components of the computer vision pipeline of safety-critical systems such as autonomous vehicles. However, previous work has demonstrated the feasibility of using physical adversarial examples to attack image classification systems.},
address = {{Toronto Canada}},
author = {Sehwag, Vikash and Sitawarin, Chawin and Bhagoji, Arjun Nitin and Mosenia, Arsalan and Chiang, Mung and Mittal, Prateek},
bibtex_show = {false},
booktitle = {Proceedings of the 2018 {{ACM SIGSAC Conference}} on {{Computer}} and {{Communications Security}}},
copyright = {Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC-BY-NC-ND)},
doi = {10.1145/3243734.3278515},
isbn = {978-1-4503-5693-0},
language = {en},
month = oct,
pages = {2285--2287},
pdf = {https://dl.acm.org/citation.cfm?id=3278515},
publisher = {{ACM}},
shorttitle = {Not All Pixels Are Born Equal},
title = {Not All Pixels Are Born Equal: An Analysis of Evasion Attacks under Locality Constraints},
year = {2018}
}
@inproceedings{sitawarin_adversarial_2021,
abbr = {NeurIPS},
author = {Sitawarin, Chawin and Kornaropoulos, Evgenios M and Song, Dawn and Wagner, David},
abstract = {Adversarial examples are a widely studied phenomenon in machine learning models. While most of the attention has been focused on neural networks, other practical models also suffer from this issue. In this work, we propose an algorithm for evaluating the adversarial robustness of $k$-nearest neighbor classification, i.e., finding a minimum-norm adversarial example. Diverging from previous proposals, we propose the first geometric approach by performing a search that expands outwards from a given input point. On a high level, the search radius expands to the nearby higher-order Voronoi cells until we find a cell that classifies differently from the input point. To scale the algorithm to a large $k$, we introduce approximation steps that find perturbation with smaller norm, compared to the baselines, in a variety of datasets. Furthermore, we analyze the structural properties of a dataset where our approach outperforms the competition.},
bibtex_show = {true},
booktitle = {Advances in Neural Information Processing Systems},
pdf = {https://openreview.net/forum?id=2j3B_YkC8r},
publisher = {{Curran Associates, Inc.}},
editor = {A. Beygelzimer and Y. Dauphin and P. Liang and J. Wortman Vaughan},
title = {Adversarial Examples for k-Nearest Neighbor Classifiers Based on Higher-Order Voronoi Diagrams},
volume = {34},
year = {2021},
preview = {knn_neurips2021_thumbnail.png},
arxiv = {2011.09719},
code = {https://github.com/wagner-group/geoadex}
}
@article{sitawarin_darts_2018,
abbr = {Preprint},
abstract = {Sign recognition is an integral part of autonomous cars. Any misclassification of traffic signs can potentially lead to a multitude of disastrous consequences, ranging from a life-threatening accident to even a large-scale interruption of transportation services relying on autonomous cars. In this paper, we propose and examine security attacks against sign recognition systems for Deceiving Autonomous caRs with Toxic Signs (we call the proposed attacks DARTS). In particular, we introduce two novel methods to create these toxic signs. First, we propose Out-of-Distribution attacks, which expand the scope of adversarial examples by enabling the adversary to generate these starting from an arbitrary point in the image space compared to prior attacks which are restricted to existing training/test data (In-Distribution). Second, we present the Lenticular Printing attack, which relies on an optical phenomenon to deceive the traffic sign recognition system. We extensively evaluate the effectiveness of the proposed attacks in both virtual and real-world settings and consider both white-box and black-box threat models. Our results demonstrate that the proposed attacks are successful under both settings and threat models. We further show that Out-of-Distribution attacks can outperform In-Distribution attacks on classifiers defended using the adversarial training defense, exposing a new attack vector for these defenses.},
archiveprefix = {arXiv},
author = {Sitawarin, Chawin and Bhagoji, Arjun Nitin and Mosenia, Arsalan and Chiang, Mung and Mittal, Prateek},
bibtex_show = {true},
code = {https://github.com/inspire-group/advml-traffic-sign},
copyright = {All rights reserved},
eprint = {1802.06430},
eprinttype = {arxiv},
journal = {arXiv:1802.06430 [cs]},
month = may,
pdf = {https://arxiv.org/abs/1802.06430},
primaryclass = {cs},
shorttitle = {{{DARTS}}},
title = {{{DARTS}}: Deceiving Autonomous Cars with Toxic Signs},
year = {2018}
}
@article{sitawarin_defending_2019,
abbr = {Preprint},
abstract = {(We took the paper down from arXiv because the defense is broken by our new attack. The paper is still available [here](https://drive.google.com/file/d/1_3SjKi92mfCRAg99EXEJXpOpGCCw2OXN/view?usp=sharing)) Robustness is an increasingly important property of machine learning models as they become more and more prevalent. We propose a defense against adversarial examples based on a k-nearest neighbor (kNN) on the intermediate activation of neural networks. Our scheme surpasses state-of-the-art defenses on MNIST and CIFAR-10 against l2-perturbation by a significant margin. With our models, the mean perturbation norm required to fool our MNIST model is 3.07 and 2.30 on CIFAR-10. Additionally, we propose a simple certifiable lower bound on the l2-norm of the adversarial perturbation using a more specific version of our scheme, a 1-NN on representations learned by a Lipschitz network. Our model provides a nontrivial average lower bound of the perturbation norm, comparable to other schemes on MNIST with similar clean accuracy.},
archiveprefix = {arXiv},
author = {Sitawarin, Chawin and Wagner, David},
bibtex_show = {true},
code = {https://github.com/chawins/knn-defense},
copyright = {Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC-BY-NC-ND)},
eprint = {1906.09525},
eprinttype = {arxiv},
journal = {arXiv:1906.09525 [cs]},
month = jun,
pdf = {https://arxiv.org/abs/1906.09525},
primaryclass = {cs},
title = {Defending against Adversarial Examples with K-Nearest Neighbor},
year = {2019},
arxiv = {1906.09525},
preview = {defending_thumbnail.png}
}
@inproceedings{sitawarin_demystifying_2022,
abbr = {ICML},
abstract = {Neural networks' lack of robustness against attacks raises concerns in security-sensitive settings
such as autonomous vehicles. While many countermeasures may look promising, only a few withstand rigorous evaluation.
Defenses using random transformations (RT) have shown impressive results, particularly BaRT (Raff et al., 2019) on ImageNet.
However, this type of defense has not been rigorously evaluated, leaving its robustness properties poorly understood.
Their stochastic properties make evaluation more challenging and render many proposed attacks on deterministic models inapplicable.
First, we show that the BPDA attack (Athalye et al., 2018a) used in BaRT’s evaluation is ineffective and likely over-estimates its robustness.
We then attempt to construct the strongest possible RT defense through the informed selection of transformations and
Bayesian optimization for tuning their parameters. Furthermore, we create the strongest possible attack to evaluate our RT defense.
Our new attack vastly outperforms the baseline, reducing the accuracy by 83% compared to the 19% reduction by the commonly
used EoT attack (4.3x improvement). Our result indicates that the RT defense on the Imagenette dataset (a ten-class subset of ImageNet)
is not robust against adversarial examples. Extending the study further, we use our new attack to adversarially train RT defense
(called AdvRT), resulting in a large robustness gain.},
arxiv = {2207.03574},
author = {Chawin Sitawarin and Zachary Golan-Strieb and David Wagner},
bibtex_show = {true},
booktitle = {Proceedings of the 39th International Conference on Machine Learning},
pdf = {https://proceedings.mlr.press/v162/sitawarin22a/sitawarin22a.pdf},
slides = {demysify_random_transform_icml22_slides.pdf},
poster = {demysify_random_transform_icml22_poster.pdf},
talk = {https://slideslive.com/38983632/demystifying-the-adversarial-robustness-of-random-transformation-defenses},
code = {https://github.com/wagner-group/demystify-random-transform},
title = {Demystifying the Adversarial Robustness of Random Transformation Defenses},
url = {https://proceedings.mlr.press/v162/sitawarin22a/sitawarin22a.pdf},
year = {2022},
preview = {demysify_icml2021_thumbnail.png},
note = {Also appeared in AAAI-2022 Workshop on Adversarial Machine Learning and Beyond},
award = {Best Paper Award from AAAI-2022 Workshop on Adversarial Machine Learning and Beyond},
award_name = {Best Workshop Paper}
}
@inproceedings{sitawarin_improving_2021,
title = {Improving the Accuracy-Robustness Trade-off for Dual-Domain Adversarial Training},
abbr = {Workshop},
abstract = {While Adversarial Training remains the standard in improving robustness to adversarial attack, it often sacrifices accuracy on natural (clean) samples to a significant extent. Dual-domain training, optimizing on both clean and adversarial objectives, can help realize a better trade-off between clean accuracy and robustness. In this paper, we develop methods to improve dual-domain training for large adversarial perturbations and complex datasets. We first demonstrate that existing methods suffer from poor performance in this setting, due to a poor training procedure and overfitting to a particular attack. Then, we develop novel methods to address these issues. First, we show that adding KLD regularization to the dual training objective mitigates this overfitting and achieves a better trade-off, on CIFAR-10 and a 10-class subset of ImageNet. Then, inspired by domain adaptation, we develop a new normalization technique, Dual Batch Normalization, to further improve accuracy. Combining these two strategies, our model sets a new state of the art in trade-off performance for dual-domain adversarial training.},
author = {Sitawarin, Chawin and Sridhar, Arvind P and Wagner, David},
bibtex_show = {false},
booktitle = {Workshop on {{Uncertainty}} and {{Robustness}} in {{Deep Learning}}},
copyright = {Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC-BY-NC-ND)},
language = {en},
month = jul,
pages = {10},
pdf = {https://chawins.github.io/assets/pdf/UDL2021-paper-048.pdf},
year = {2021},
preview = {tradeoff_urdl2021_thumbnail.png},
code = {https://github.com/wagner-group/dual-domain-at}
}
@article{sitawarin_inversedesigned_2018,
abbr = {Photon. Res.},
abstract = {Typically, photonic waveguides designed for nonlinear frequency conversion rely on intuitive and established principles, including index guiding and bandgap engineering, and are based on simple shapes with high degrees of symmetry. We show that recently developed inverse-design techniques can be applied to discover new kinds of microstructured fibers and metasurfaces designed to achieve large nonlinear frequency-conversion efficiencies. As a proof of principle, we demonstrate complex, wavelength-scale chalcogenide glass fibers and gallium phosphide three-dimensional metasurfaces exhibiting some of the largest nonlinear conversion efficiencies predicted thus far, e.g., lowering the power requirement for third-harmonic generation by $10^4$ and enhancing second-harmonic generation conversion efficiency by $10^7$. Such enhancements arise because, in addition to enabling a great degree of tunability in the choice of design wavelengths, these optimization tools ensure both frequency- and phase-matching in addition to large nonlinear overlap factors.},
arxiv = {1711.07810},
author = {Chawin Sitawarin and Weiliang Jin and Zin Lin and Alejandro W. Rodriguez},
bibtex_show = {false},
doi = {10.1364/PRJ.6.000B82},
journal = {Photon. Res.},
keywords = {Nonlinear optics, fibers; Harmonic generation and mixing ; Nonlinear optics, devices; Computational electromagnetic methods ; Nanophotonics and photonic crystals ; Chalcogenide fibers; Harmonic generation; Light matter interactions; Microstructured fibers; Phase matching; Second harmonic generation},
month = {May},
number = {5},
pages = {B82--B89},
publisher = {OSA},
title = {Inverse-designed photonic fibers and metasurfaces for nonlinear frequency conversion (Invited)},
url = {http://www.osapublishing.org/prj/abstract.cfm?URI=prj-6-5-B82},
volume = {6},
year = {2018},
pdf = {https://opg.optica.org/prj/fulltext.cfm?uri=prj-6-5-B82&id=385779},
preview = {photores_thumbnail.png}
}
@inproceedings{sitawarin_minimumnorm_2020,
abbr = {DLS},
abstract = {We study the robustness against adversarial examples of kNN classifiers and classifiers that combine kNN with neural networks. The main difficulty lies in the fact that finding an optimal attack on kNN is intractable for typical datasets. In this work, we propose a gradient-based attack on kNN and kNN-based defenses, inspired by the previous work by Sitawarin & Wagner [1]. We demonstrate that our attack outperforms their method on all of the models we tested with only a minimal increase in the computation time. The attack also beats the state-of-the-art attack [2] on kNN when k > 1 using less than 1% of its running time. We hope that this attack can be used as a new baseline for evaluating the robustness of kNN and its variants.},
address = {{Los Alamitos, CA, USA}},
author = {Sitawarin, Chawin and Wagner, David},
bibtex_show = {false},
booktitle = {2020 {{IEEE}} Security and Privacy Workshops ({{SPW}})},
code = {https://github.com/chawins/knn-defense},
copyright = {All rights reserved},
doi = {10.1109/SPW50608.2020.00023},
keywords = {computational modeling,conferences,data privacy,neural networks,robustness,security},
month = may,
pages = {34--40},
arxiv = {2003.06559},
pdf = {https://ieeexplore.ieee.org/document/9283888},
publisher = {{IEEE Computer Society}},
talk = {https://youtu.be/4YNoJQ0ptGE?t=2699},
title = {Minimum-Norm Adversarial Examples on {{KNN}} and {{KNN}} Based Models},
year = {2020},
preview = {minnorm_thumbnail.png}
}
@inproceedings{sitawarin_robustness_2019,
abbr = {DLS},
abstract = {Despite a large amount of attention on adversarial examples, very few works have demonstrated an effective defense against this threat. We examine Deep k-Nearest Neighbor (DkNN), a proposed defense that combines k-Nearest Neighbor (kNN) and deep learning to improve the model's robustness to adversarial examples. It is challenging to evaluate the robustness of this scheme due to a lack of efficient algorithms for attacking kNN classifiers with large k and high-dimensional data. We propose a heuristic attack that allows us to use gradient descent to find adversarial examples for kNN classifiers, and then apply it to attack the DkNN defense as well. Results suggest that our attack is moderately stronger than any naive attack on kNN and significantly outperforms other attacks on DkNN.},
address = {{Los Alamitos, CA, USA}},
author = {Sitawarin, Chawin and Wagner, David},
bibtex_show = {false},
booktitle = {2019 {{IEEE}} Security and Privacy Workshops ({{SPW}})},
copyright = {All rights reserved},
doi = {10.1109/SPW.2019.00014},
keywords = {adaptation models,deep learning,neural networks,optimization,perturbation methods,robustness,training},
month = may,
pages = {1--7},
pdf = {https://arxiv.org/abs/1903.08333},
publisher = {{IEEE Computer Society}},
talk = {https://youtu.be/PmgKS3zckx8},
title = {On the Robustness of Deep K-Nearest Neighbors},
year = {2019},
arxiv = {1903.08333},
preview = {deepknn_thumbnail.png},
code = {https://github.com/chawins/knn-defense}
}
@article{sitawarin_rogue_2018,
abbr = {DLS},
abstract = {We propose a new real-world attack against the computer vision based systems of autonomous vehicles (AVs). Our novel Sign Embedding attack exploits the concept of adversarial examples to modify innocuous signs and advertisements in the environment such that they are classified as the adversary's desired traffic sign with high confidence. Our attack greatly expands the scope of the threat posed to AVs since adversaries are no longer restricted to just modifying existing traffic signs as in previous work. Our attack pipeline generates adversarial samples which are robust to the environmental conditions and noisy image transformations present in the physical world. We ensure this by including a variety of possible image transformations in the optimization problem used to generate adversarial samples. We verify the robustness of the adversarial samples by printing them out and carrying out drive-by tests simulating the conditions under which image capture would occur in a real-world scenario. We experimented with physical attack samples for different distances, lighting conditions and camera angles. In addition, extensive evaluations were carried out in the virtual setting for a variety of image transformations. The adversarial samples generated using our method have adversarial success rates in excess of 95\% in the physical as well as virtual settings.},
archiveprefix = {arXiv},
author = {Sitawarin, Chawin and Bhagoji, Arjun Nitin and Mosenia, Arsalan and Mittal, Prateek and Chiang, Mung},
bibtex_show = {true},
code = {https://github.com/inspire-group/advml-traffic-sign},
copyright = {All rights reserved},
eprint = {1801.02780},
eprinttype = {arxiv},
journal = {arXiv:1801.02780 [cs]},
month = mar,
pdf = {https://arxiv.org/abs/1801.02780},
primaryclass = {cs},
shorttitle = {Rogue Signs},
title = {Rogue Signs: Deceiving Traffic Sign Recognition with Malicious Ads and Logos},
year = {2018}
}
@inproceedings{sitawarin_sat_2021,
abbr = {AISec},
title = {SAT: Improving Adversarial Training via Curriculum-Based Loss Smoothing},
abstract = {Adversarial training (AT) has become a popular choice for training robust networks.
However, it tends to sacrifice clean accuracy heavily in favor of robustness and suffers
from a large generalization error. To address these concerns, we propose Smooth Adversarial
Training (SAT), guided by our analysis on the eigenspectrum of the loss Hessian. We
find that curriculum learning, a scheme that emphasizes on starting ``easy'' and gradually
ramping up on the ``difficulty'' of training, smooths the adversarial loss landscape
for a suitably chosen difficulty metric. We present a general formulation for curriculum
learning in the adversarial setting and propose two difficulty metrics based on the
maximal Hessian eigenvalue (H-SAT) and the softmax probability (P-SAT). We demonstrate
that SAT stabilizes network training even for a large perturbation norm and allows
the network to operate at a better clean accuracy versus robustness trade-off curve
compared to AT. This leads to a significant improvement in both clean accuracy and
robustness compared to AT, TRADES, and other baselines. To highlight a few results,
our best model improves normal and robust accuracy by 6% and 1% on CIFAR-100 compared
to AT, respectively. On Imagenette, a ten-class subset of ImageNet, our model outperforms
AT by 23% and 3% on normal and robust accuracy respectively.},
address = {New York, NY, USA},
arxiv = {2003.09347},
author = {Sitawarin, Chawin and Chakraborty, Supriyo and Wagner, David},
bibtex_show = {true},
booktitle = {Proceedings of the 14th ACM Workshop on Artificial Intelligence and Security},
doi = {10.1145/3474369.3486878},
isbn = {9781450386579},
keywords = {adversarial examples, adversarial machine learning, curriculum learning},
location = {Virtual Event, Republic of Korea},
numpages = {12},
pages = {25--36},
pdf = {https://dl.acm.org/doi/abs/10.1145/3474369.3486878},
publisher = {Association for Computing Machinery},
series = {AISec '21},
slides = {aisec_2021.pdf},
url = {https://doi.org/10.1145/3474369.3486878},
year = {2021},
preview = {sat_aisec2019_thumbnail.png}
}
@inproceedings{sridhar_mitigating_2021,
abbr = {Workshop},
abstract = {The adversarial training paradigm has become the standard in training deep neural networks for robustness. Yet, it remains unstable, with the mechanisms driving this instability poorly understood. In this study, we discover that this instability is primarily driven by a non-smooth optimization landscape and an internal covariate shift phenomenon, and show that Batch Normalization (BN) can effectively mitigate both these issues. Further, we demonstrate that BN universally improves clean and robust performance across various defenses, datasets, and model types, with greater improvement on more difficult tasks. Finally, we confirm BN's heterogeneous distribution issue with mixed-batch training and propose a solution.},
author = {Sridhar, Arvind P and Sitawarin, Chawin and Wagner, David},
bibtex_show = {false},
booktitle = {Security and {{Safety}} in {{Machine Learning Systems Workshop}}},
copyright = {Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License (CC-BY-NC-ND)},
language = {en},
month = may,
pages = {13},
pdf = {https://aisecure-workshop.github.io/aml-iclr2021/papers/43.pdf},
title = {Mitigating Adversarial Training Instability with Batch Normalization},
year = {2021},
preview = {mitigating_thumbnail.png}
}
@inproceedings{hingun_reap_2023,
abbr = {ICCV},
abstract = {Machine learning models are known to be susceptible to adversarial perturbation. One famous attack is the adversarial patch, a sticker with a particularly crafted pattern that makes the model incorrectly predict the object it is placed on. This attack presents a critical threat to cyber-physical systems that rely on cameras such as autonomous cars. Despite the significance of the problem, conducting research in this setting has been difficult; evaluating attacks and defenses in the real world is exceptionally costly while synthetic data are unrealistic. In this work, we propose the REAP (REalistic Adversarial Patch) Benchmark, a digital benchmark that allows the user to evaluate patch attacks on real images, and under real-world conditions. Built on top of the Mapillary Vistas dataset, our benchmark contains over 14,000 traffic signs. Each sign is augmented with a pair of geometric and lighting transformations, which can be used to apply a digitally generated patch realistically onto the sign, while matching real-world conditions. Using our benchmark, we perform the first large-scale assessments of adversarial patch attacks under realistic conditions. Our experiments suggest that adversarial patch attacks may present a smaller threat than previously believed and that the success rate of an attack on simpler digital simulations is not predictive of its actual effectiveness in practice.},
author = {Hingun{${*}$}, Nabeel and Sitawarin{${*}$}, Chawin and Li, Jerry and Wagner, David},
bibtex_show = {true},
arxiv = {2212.05680},
primaryclass = {cs},
title = {REAP: A Large-Scale Realistic Adversarial Patch Benchmark},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
month = {October},
year = {2023},
pages = {4640--4651},
code = {https://github.com/wagner-group/reap-benchmark},
preview = {reap_thumbnail.png},
pdf = {https://openaccess.thecvf.com/content/ICCV2023/papers/Hingun_REAP_A_Large-Scale_Realistic_Adversarial_Patch_Benchmark_ICCV_2023_paper.pdf},
poster = {reap_poster_iccv2023.pdf}
}
@inproceedings{sitawarin_preprocessors_2023,
abbr = {ICML},
title = {Preprocessors Matter! {{Realistic}} Decision-Based Attacks on Machine Learning Systems},
booktitle = {Proceedings of the 40th International Conference on Machine Learning},
author = {Sitawarin, Chawin and Tram{\`e}r, Florian and Carlini, Nicholas},
editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
year = {2023},
month = jul,
series = {Proceedings of Machine Learning Research},
volume = {202},
pages = {32008--32032},
publisher = {{PMLR}},
url = {https://proceedings.mlr.press/v202/sitawarin23a.html},
abstract = {Decision-based attacks construct adversarial examples against a machine learning (ML) model by making only hard-label queries. These attacks have mainly been applied directly to standalone neural networks. However, in practice, ML models are just one component of a larger learning system. We find that by adding a single preprocessor in front of a classifier, state-of-the-art query-based attacks are up to seven\texttimes{} less effective at attacking a prediction pipeline than at attacking the model alone. We explain this discrepancy by the fact that most preprocessors introduce some notion of invariance to the input space. Hence, attacks that are unaware of this invariance inevitably waste a large number of queries to re-discover or overcome it. We, therefore, develop techniques to (i) reverse-engineer the preprocessor and then (ii) use this extracted information to attack the end-to-end system. Our preprocessors extraction method requires only a few hundred queries, and our preprocessor-aware attacks recover the same efficacy as when attacking the model alone. The code can be found at https://github.com/google-research/preprocessor-aware-black-box-attack.},
copyright = {All rights reserved},
code = {https://github.com/google-research/preprocessor-aware-black-box-attack},
arxiv = {2210.03297},
pdf = {https://proceedings.mlr.press/v202/sitawarin23a.html},
preview = {prep_attack_thumbnail.png},
bibtex_show = {true},
slides = {preprocessor_attack_slides_icml2023.pdf},
poster = {preprocessor_attack_poster_icml2023.pdf},
talk = {https://recorder-v3.slideslive.com/?share=83549&s=94c40a2b-dbe1-405e-a154-d14aa73cc73e}
}
@inproceedings{sitawarin_partbased_2023,
abbr = {ICLR},
abstract = {We show that combining human prior knowledge with end-to-end learning can improve the robustness of deep neural networks by introducing a part-based model for object classification. We believe that the richer form of annotation helps guide neural networks to learn more robust features without requiring more samples or larger models. Our model combines a part segmentation model with a tiny classifier and is trained end-to-end to simultaneously segment objects into parts and then classify the segmented object. Empirically, our part-based models achieve both higher accuracy and higher adversarial robustness than a ResNet-50 baseline on all three datasets. For instance, the clean accuracy of our part models is up to 15 percentage points higher than the baseline's, given the same level of robustness. Our experiments indicate that these models also reduce texture bias and yield better robustness against common corruptions and spurious correlations. The code is publicly available at https://github.com/chawins/adv-part-model.},
title = {Part-Based Models Improve Adversarial Robustness},
booktitle = {International Conference on Learning Representations},
author = {Sitawarin, Chawin and Pongmala, Kornrapat and Chen, Yizheng and Carlini, Nicholas and Wagner, David},
year = {2023},
copyright = {All rights reserved},
url = {https://openreview.net/forum?id=bAMTaeqluh4},
pdf = {https://openreview.net/pdf?id=bAMTaeqluh4},
month = may,
arxiv = {2209.09117},
code = {https://github.com/chawins/adv-part-model},
slides = {part_model_slides_iclr2023.pdf},
bibtex_show = {false},
talk = {https://recorder-v3.slideslive.com/?share=83549&s=94c40a2b-dbe1-405e-a154-d14aa73cc73e},
preview = {part_model_thumbnail.png}
}
@inproceedings{xiang_short_2023,
abbr = {VehicleSec},
title = {Short: Certifiably Robust Perception against Adversarial Patch Attacks: A Survey},
booktitle = {1st Symposium on {{Vehicle Security}} and {{Privacy}} ({{VehicleSec}})},
author = {Xiang, Chong and Sitawarin, Chawin and Wu, Tong and Mittal, Prateek},
year = {2023},
month = mar,
abstract = {The physical-world adversarial patch attack poses a security threat to AI perception models in autonomous vehicles. To mitigate this threat, researchers have designed defenses with certifiable robustness. In this paper, we survey existing certifiably robust defenses and highlight core robustness techniques that are applicable to a variety of perception tasks, including classification, detection, and segmentation. We emphasize the unsolved problems in this space to guide future research, and call for attention and efforts from both academia and industry to robustify perception models in autonomous vehicles.},
copyright = {All rights reserved},
langid = {english},
pdf = {https://www.ndss-symposium.org/wp-content/uploads/2023/02/vehiclesec2023-23020-paper.pdf},
note = {Co-located with {{NDSS}} 2023},
bibtex_show = {false},
code = {https://github.com/inspire-group/patch-defense-leaderboard},
poster = {https://drive.google.com/file/d/1vPfkU0tPVQ-aBC_mEa_LsYZkmjPqyiEP/view},
slides = {https://docs.google.com/presentation/d/1PEQrz8lD0Bzl6QCtBUzk2xULgsRgGRTm},
talk = {https://drive.google.com/file/d/1XXZkufk2ueLvNl7QUlxevyeNeEDjI-70/view},
preview = {patch_cert_survey_thumbnail.png},
award = {Best Short/WIP Paper Award Runner-Up},
award_name = {Best Paper Runner-Up}
}
@inproceedings{huang_stronger_2024,
abbr = {Preprint},
title = {Stronger Universal and Transfer Attacks by Suppressing Refusals},
booktitle = {{{NeurIPS}} Safe Generative {{AI}} Workshop 2024},
author = {Huang, David and Shah, Avidan and Araujo, Alexandre and Wagner, David and Sitawarin, Chawin},
abstract = {Making large language models (LLMs) safe for mass deployment is a complex and ongoing challenge. Efforts have focused on aligning models to human preferences (RLHF) in order to prevent malicious uses, essentially embedding a ``safety feature'' into the model's parameters. The Greedy Coordinate Gradient (GCG) algorithm (Zou et al., 2023b) emerges as one of the most popular automated jailbreaks, an attack that circumvents this safety training. So far, it is believed that these optimization-based attacks are sample-specific as opposed to hand-crafted ones. To make the automated jailbreak universal and transferable, they require incorporating multiple samples and models into the objective function. Contrary to this belief, we find that the adversarial prompts discovered by such optimizers are inherently prompt-universal and transferable, even when optimized on a single model and a single harmful request. To further amplify this phenomenon, we introduce a new objective to these optimizers to explicitly deactivate the safety feature to create an even stronger universal and transferable attack. Without requiring a large number of queries or accessing output token probabilities, our transfer attack, optimized on Llama-3, achieves a 92% success rate against the state-of-the-art Circuit Breaker defense, compared to 2.5% by white-box GCG. Crucially, our method also attains state-of-the-art transfer rates on frontier models: GPT-3.5-Turbo (96%), GPT-4o (82%), and GPT-4o-mini (88%).},
year = {2024},
month = oct,
note = {Full version is under submission.},
url = {https://openreview.net/forum?id=eIBWRAbhND},
bibtex_show = {true},
preview = {iris_thumbnail.png}
}
@inproceedings{piet_markmywords_2024,
abbr = {Workshop},
title = {{{MarkMyWords}}: {{Analyzing}} and Evaluating Language Model Watermarks},
booktitle = {Statistical Foundations of {{LLMs}} and Foundation Models ({{NeurIPS}} 2024 Workshop)},
author = {Piet, Julien and Sitawarin, Chawin and Fang, Vivian and Mu, Norman and Wagner, David},
year = {2024},
month = oct,
url = {https://openreview.net/forum?id=lZJCtOWdoU},
keywords = {Benchmark,LLM,Watermark},
abstract = {The capabilities of large language models have grown significantly in recent years and so too have concerns about their misuse. In this context, the ability to distinguish machine-generated text from human-authored content becomes important. Prior works have proposed numerous schemes to watermark text, which would benefit from a systematic evaluation framework. This work focuses on text watermarking techniques - as opposed to image watermarks - and proposes a comprehensive benchmark for them under different tasks as well as practical attacks. We focus on three main metrics: quality, size (e.g. the number of tokens needed to detect a watermark), and tamper-resistance. Current watermarking techniques are good enough to be deployed: Kirchenbauer et al. can watermark Llama2-7B-chat with no perceivable loss in quality in under 100 tokens, and with good tamper-resistance to simple attacks, regardless of temperature. We argue that watermark indistinguishability is too strong a requirement: schemes that slightly modify logit distributions outperform their indistinguishable counterparts with no noticeable loss in generation quality. We publicly release our benchmark.},
arxiv = {2312.00273},
pdf = {https://openreview.net/forum?id=lZJCtOWdoU},
code = {https://github.com/wagner-group/MarkMyWords},
bibtex_show = {true},
website = {https://wagner-group.github.io/projects/markmywords/index.html},
preview = {markmywords_thumbnail.jpg}
}
@inproceedings{chen_struq_2024,
abbr = {USENIX Security},
title = {{{StruQ}}: Defending against Prompt Injection with Structured Queries},
booktitle = {34th {{USENIX}} Security Symposium ({{USENIX}} Security 25)},
author = {Chen, Sizhe and Piet, Julien and Sitawarin, Chawin and Wagner, David},
year = {2024},
month = feb,
abstract = {Recent advances in Large Language Models (LLMs) enable exciting LLM-integrated applications, which perform text-based tasks by utilizing their advanced language understanding capabilities. However, as LLMs have improved, so have the attacks against them. Prompt injection attacks are an important threat: they trick the model to deviate from the original application's instructions and instead follow user directives. These attacks rely on the LLM's ability to follow instructions and inability to separate the prompts and user data. We introduce structured queries, a general approach to tackle this problem. Structured queries separate prompts and data into two channels. We implement a system that supports structured queries. This system is made of (1) a secure front-end that formats a prompt and user data into a special format, and (2) a specially trained LLM that can produce high-quality outputs from these inputs. The LLM is trained using a novel fine-tuning strategy: we convert a base (non-instruction-tuned) LLM to a structured instruction-tuned model that will only follow instructions in the prompt portion of a query. To do so, we augment standard instruction tuning datasets with examples that also include instructions in the data portion of the query, and fine-tune the model to ignore these. Our system significantly improves resistance to prompt injection attacks, with little or no impact on utility. Our code is released at https://github.com/Sizhe-Chen/PromptInjectionDefense.},
url = {http://arxiv.org/abs/2402.06363},
bibtex_show = {false},
arxiv = {2402.06363},
code = {https://github.com/sizhe-chen/struq},
preview = {struq_thumbnail.png}
}
@inproceedings{li_oodrobustbench_2024,
abbr = {ICML},
title = {{{OODRobustBench}}: A Benchmark and Large-Scale Analysis of Adversarial Robustness under Distribution Shift},
booktitle = {Proceedings of the 41st International Conference on Machine Learning},
author = {Li, Lin and Wang, Yifei and Sitawarin, Chawin and Spratling, Michael W.},
editor = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
year = {2024},
month = jul,
series = {Proceedings of Machine Learning Research},
volume = {235},
pages = {28830--28869},
publisher = {PMLR},
url = {https://proceedings.mlr.press/v235/li24bp.html},
abstract = {Existing works have made great progress in improving adversarial robustness, but typically test their method only on data from the same distribution as the training data, i.e. in-distribution (ID) testing. As a result, it is unclear how such robustness generalizes under input distribution shifts, i.e. out-of-distribution (OOD) testing. This omission is concerning as such distribution shifts are unavoidable when methods are deployed in the wild. To address this issue we propose a benchmark named OODRobustBench to comprehensively assess OOD adversarial robustness using 23 dataset-wise shifts (i.e. naturalistic shifts in input distribution) and 6 threat-wise shifts (i.e., unforeseen adversarial threat models). OODRobustBench is used to assess 706 robust models using 60.7K adversarial evaluations. This large-scale analysis shows that: 1) adversarial robustness suffers from a severe OOD generalization issue; 2) ID robustness correlates strongly with OOD robustness in a positive linear way. The latter enables the prediction of OOD robustness from ID robustness. We then predict and verify that existing methods are unlikely to achieve high OOD robustness. Novel methods are therefore required to achieve OOD robustness beyond our prediction. To facilitate the development of these methods, we investigate a wide range of techniques and identify several promising directions. Code and models are available at: https://github.com/OODRobustBench/OODRobustBench.},
arxiv = {2310.12793},
bibtex_show = {true},
pdf = {https://openreview.net/forum?id=RnYd44LR2v},
code = {https://github.com/oodrobustbench/oodrobustbench},
preview = {oodrobust_thumbnail.png}
}
@inproceedings{ding_vulnerability_2024,
abbr = {ICSE},
author = {Ding, Yangruibo and Fu, Yanjun and Ibrahim, Omniyyah and Sitawarin, Chawin and Chen, Xinyun and Alomair, Basel and Wagner, David and Ray, Baishakhi and Chen, Yizheng},
title = {Vulnerability Detection with Code Language Models: How Far Are We?},
year = {2024},
month = mar,
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {In the context of the rising interest in code language models (code LMs) and vulnerability detection, we study the effectiveness of code LMs for detecting vulnerabilities. Our analysis reveals significant shortcomings in existing vulnerability datasets, including poor data quality, low label accuracy, and high duplication rates, leading to unreliable model performance in realistic vulnerability detection scenarios. Additionally, the evaluation methods used with these datasets are not representative of real-world vulnerability detection. To address these challenges, we introduce PrimeVul, a new dataset for training and evaluating code LMs for vulnerability detection. PrimeVul incorporates a novel set of data labeling techniques that achieve comparable label accuracy to human-verified benchmarks while significantly expanding the dataset. It also implements a rigorous data de-duplication and chronological data splitting strategy to mitigate data leakage issues, alongside introducing more realistic evaluation metrics and settings. This comprehensive approach aims to provide a more accurate assessment of code LMs' performance in real-world conditions. Evaluating code LMs on PrimeVul reveals that existing benchmarks significantly overestimate the performance of these models. For instance, a state-of-the-art 7B model scored 68.26\% F1 on BigVul but only 3.09\% F1 on PrimeVul. Attempts to improve performance through advanced training techniques and larger models like GPT-3.5 and GPT-4 were unsuccessful, with results akin to random guessing in the most stringent settings. These findings underscore the considerable gap between current capabilities and the practical requirements for deploying code LMs in security roles, highlighting the need for more innovative research in this domain.},
url = {http://arxiv.org/abs/2403.18624},
booktitle = {Proceedings of the IEEE/ACM 47th International Conference on Software Engineering},
series = {ICSE '25},
bibtex_show = {true},
arxiv = {2403.18624},
code = {https://github.com/dlvuldet/primevul},
preview = {primevul_thumbnail.png}
}
@inproceedings{piet_jatmo_2024,
abbr = {ESORICS},
title = {Jatmo: Prompt Injection Defense by Task-Specific Finetuning},
shorttitle = {Jatmo},
author = {Piet{${*}$}, Julien and Alrashed{${*}$}, Maha and Sitawarin, Chawin and Chen, Sizhe and Wei, Zeming and Sun, Elizabeth and Alomair, Basel and Wagner, David},
year = {2024},
month = mar,
url = {http://arxiv.org/abs/2312.17673},
booktitle = {Computer {{Security}} -- {{ESORICS}} 2024},
urldate = {2024-01-03},
abstract = {Large Language Models (LLMs) are attracting significant research attention due to their instruction-following abilities, allowing users and developers to leverage LLMs for a variety of tasks. However, LLMs are vulnerable to prompt-injection attacks: a class of attacks that hijack the model's instruction-following abilities, changing responses to prompts to undesired, possibly malicious ones. In this work, we introduce Jatmo, a method for generating task-specific models resilient to prompt-injection attacks. Jatmo leverages the fact that LLMs can only follow instructions once they have undergone instruction tuning. It harnesses a teacher instruction-tuned model to generate a task-specific dataset, which is then used to fine-tune a base model (i.e., a non-instruction-tuned model). Jatmo only needs a task prompt and a dataset of inputs for the task: it uses the teacher model to generate outputs. For situations with no pre-existing datasets, Jatmo can use a single example, or in some cases none at all, to produce a fully synthetic dataset. Our experiments on six tasks show that Jatmo models provide the same quality of outputs on their specific task as standard LLMs, while being resilient to prompt injections. The best attacks succeeded in less than 0.5\% of cases against our models, versus over 90\% success rate against GPT-3.5-Turbo. We release Jatmo at https://github.com/wagner-group/prompt-injection-defense.},
copyright = {All rights reserved},
bibtex_show = {false},
pdf = {https://dl.acm.org/doi/abs/10.1007/978-3-031-70879-4_6},
code = {https://github.com/wagner-group/prompt-injection-defense},
arxiv = {2312.17673},
preview = {jatmo_thumbnail.png}
}
@article{sitawarin_pal_2024,
abbr = {Preprint},
title = {{{PAL}}: Proxy-Guided Black-Box Attack on Large Language Models},
shorttitle = {{{PAL}}},
author = {Sitawarin, Chawin and Mu, Norman and Wagner, David and Araujo, Alexandre},
year = {2024},
month = feb,
number = {arXiv:2402.09674},
eprint = {2402.09674},
primaryclass = {cs},
publisher = {{arXiv}},
url = {http://arxiv.org/abs/2402.09674},
urldate = {2024-02-16},
abstract = {Large Language Models (LLMs) have surged in popularity in recent months, but they have demonstrated concerning capabilities to generate harmful content when manipulated. While techniques like safety fine-tuning aim to minimize harmful use, recent works have shown that LLMs remain vulnerable to attacks that elicit toxic responses. In this work, we introduce the Proxy-Guided Attack on LLMs (PAL), the first optimization-based attack on LLMs in a black-box query-only setting. In particular, it relies on a surrogate model to guide the optimization and a sophisticated loss designed for real-world LLM APIs. Our attack achieves 84\% attack success rate (ASR) on GPT-3.5-Turbo and 48\% on Llama-2-7B, compared to 4\% for the current state of the art. We also propose GCG++, an improvement to the GCG attack that reaches 94\% ASR on white-box Llama-2-7B, and the Random-Search Attack on LLMs (RAL), a strong but simple baseline for query-based attacks. We believe the techniques proposed in this work will enable more comprehensive safety testing of LLMs and, in the long term, the development of better security guardrails. The code can be found at https://github.com/chawins/pal.},
archiveprefix = {arxiv},
copyright = {Creative Commons Attribution 4.0 International License (CC-BY)},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Cryptography and Security,Computer Science - Machine Learning},
bibtex_show = {false},
code = {https://github.com/chawins/pal},
arxiv = {2402.09674},
journal = {Under submission},
preview = {pal_thumbnail.png}
}
@inproceedings{sitawarin_defending_2024,
title = {PubDef: Defending against Transfer Attacks from Public Models},
booktitle = {The Twelfth International Conference on Learning Representations},
author = {Sitawarin, Chawin and Chang{${*}$}, Jaewon and Huang{${*}$}, David and Altoyan, Wesson and Wagner, David},
year = {2024},
month = jan,
copyright = {CC0 1.0 Universal Public Domain Dedication},
url = {https://openreview.net/forum?id=Tvwf4Vsi5F},
abstract = {Adversarial attacks have been a looming and unaddressed threat in the industry. However, through a decade-long history of the robustness evaluation literature, we have learned that mounting a strong or optimal attack is challenging. It requires both machine learning and domain expertise. In other words, the white-box threat model, religiously assumed by a large majority of the past literature, is unrealistic. In this paper, we propose a new practical threat model where the adversary relies on transfer attacks through publicly available surrogate models. We argue that this setting will become the most prevalent for security-sensitive applications in the future. We evaluate the transfer attacks in this setting and propose a specialized defense method based on a game-theoretic perspective. The defenses are evaluated under 24 public models and 11 attack algorithms across three datasets (CIFAR-10, CIFAR-100, and ImageNet). Under this threat model, our defense, PubDef, outperforms the state-of-the-art white-box adversarial training by a large margin with almost no loss in the normal accuracy. For instance, on ImageNet, our defense achieves 62% accuracy under the strongest transfer attack vs only 36% of the best adversarially trained model. Its accuracy when not under attack is only 2% lower than that of an undefended model (78% vs 80%).},
archiveprefix = {arxiv},
keywords = {Computer Science - Artificial Intelligence,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Cryptography and Security,Computer Science - Machine Learning,notion},
code = {https://github.com/wagner-group/pubdef},
bibtex_show = {true},
abbr = {ICLR},
arxiv = {2310.17645},
preview = {pubdef_thumbnail.png},
pdf = {https://openreview.net/forum?id=Tvwf4Vsi5F},
website = {https://wagner-group.github.io/projects/pubdef/index.html},
talk = {https://recorder-v3.slideslive.com/?share=92103&s=183329c8-37ce-42c4-a41a-fff6998c8055},
slides = {pubdef_iclr2024_slides.pdf}
}
@inproceedings{shah_spder_2024,
abbr = {ICLR},
title = {{{SPDER}}: Semiperiodic Damping-Enabled Object Representation},
booktitle = {The Twelfth International Conference on Learning Representations},
abstract = {We present a neural network architecture designed to naturally learn a positional embedding and overcome the spectral bias towards lower frequencies faced by conventional implicit neural representation networks. Our proposed architecture, SPDER, is a simple MLP that uses an activation function composed of a sinusoidal multiplied by a sublinear function, called the damping function. The sinusoidal enables the network to automatically learn the positional embedding of an input coordinate while the damping passes on the actual coordinate value by preventing it from being projected down to within a finite range of values. Our results indicate that SPDERs speed up training by 10x and converge to losses 1,500-50,000x lower than that of the state-of-the-art for image representation. SPDER is also state-of-the-art in audio representation. The superior representation capability allows SPDER to also excel on multiple downstream tasks such as image super-resolution and video frame interpolation. We provide intuition as to why SPDER significantly improves fitting compared to that of other INR methods while requiring no hyperparameter tuning or preprocessing.},
author = {Shah, Kathan and Sitawarin, Chawin},
year = {2024},
month = jan,
copyright = {All rights reserved},
bibtex_show = {true},
arxiv = {2306.15242},
pdf = {https://openreview.net/forum?id=92btneN9Wm},
preview = {spder_thumbnail.png},
code = {https://github.com/katop1234/SPDER/},
url = {https://openreview.net/forum?id=92btneN9Wm}
}