diff --git a/.gitignore b/.gitignore
index e158bf16..82ec436f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 .idea/
-run/
\ No newline at end of file
+run/
+__pycache__/
+web/
\ No newline at end of file
diff --git a/poetry.lock b/poetry.lock
index 3ef228dd..57f7fed2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -260,6 +260,17 @@ files = [
     {file = "cachetools-5.3.2.tar.gz", hash = "sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2"},
 ]
 
+[[package]]
+name = "casefy"
+version = "0.1.7"
+description = "Utilities for string case conversion."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "casefy-0.1.7-py3-none-any.whl", hash = "sha256:ab05ff1c67f2a8e62d9f8986fa9a849416d61ac5413ec57d1f827b4f36589cf6"},
+    {file = "casefy-0.1.7.tar.gz", hash = "sha256:6accce985a64b9edb2a610a29ac489d78fac80e52ff8f2d137e294f2f92b8027"},
+]
+
 [[package]]
 name = "catalogue"
 version = "2.0.10"
@@ -502,13 +513,13 @@ srsly = ">=2.4.0,<3.0.0"
 
 [[package]]
 name = "crfm-helm"
-version = "0.3.0"
+version = "0.4.0"
 description = "Benchmark for language models"
 optional = false
 python-versions = "<3.11,>=3.8"
 files = [
-    {file = "crfm-helm-0.3.0.tar.gz", hash = "sha256:1a9b75e2e544e8c2fc497fa4bf32c581f6c93f833c969e17843eb0bcea1f98ca"},
-    {file = "crfm_helm-0.3.0-py3-none-any.whl", hash = "sha256:6256ee699461bb7551476201fc3190b19160a758436fe4ccee563d3110d28498"},
+    {file = "crfm-helm-0.4.0.tar.gz", hash = "sha256:06d49ad3c3c07eae67898e204c856b75e96a20c93e6cf8f20e56bce2c13cdaa3"},
+    {file = "crfm_helm-0.4.0-py3-none-any.whl", hash = "sha256:3fc9c3721f78f48632cad6dfb04851de7055f83d900125fd8d247b9503a99a27"},
 ]
 
 [package.dependencies]
@@ -524,7 +535,6 @@ numpy = ">=1.23.3,<1.24.0"
 pyarrow = ">=11.0.0"
 pyext = ">=0.7,<1.0"
 pyhocon = ">=0.3.59,<0.4.0"
-pymongo = ">=4.2.0,<4.3.0"
 retrying = ">=1.3.4,<1.4.0"
 rouge-score = ">=0.1.2,<0.2.0"
 scikit-learn = ">=1.1.2,<1.2.0"
@@ -540,14 +550,16 @@ zstandard = ">=0.18.0,<0.19.0"
 
 [package.extras]
 aleph-alpha = ["aleph-alpha-client (>=2.14.0,<2.15.0)", "tokenizers (>=0.13.3,<0.14.0)"]
-all = ["crfm-helm[cleva]", "crfm-helm[human-evaluation]", "crfm-helm[images]", "crfm-helm[metrics]", "crfm-helm[models]", "crfm-helm[plots]", "crfm-helm[proxy-server]", "crfm-helm[scenarios]", "crfm-helm[slurm]"]
+all = ["crfm-helm[cleva]", "crfm-helm[human-evaluation]", "crfm-helm[images]", "crfm-helm[metrics]", "crfm-helm[models]", "crfm-helm[mongo]", "crfm-helm[plots]", "crfm-helm[proxy-server]", "crfm-helm[scenarios]", "crfm-helm[slurm]"]
 anthropic = ["anthropic (>=0.2.5,<0.3.0)", "websocket-client (>=1.3.2,<1.4.0)"]
 cleva = ["jieba (==0.42.1)", "langdetect (==1.0.9)", "opencc (==1.1.6)", "pypinyin (==0.49.0)", "unidecode (==1.3.6)"]
-dev = ["black (>=22.10.0,<22.11.0)", "flake8 (>=5.0.4,<5.1.0)", "mypy (>=0.982,<1.0)", "pre-commit (>=2.20.0,<2.21.0)", "pytest (>=7.2.0,<7.3.0)"]
+dev = ["black (>=22.10.0,<22.11.0)", "flake8 (>=5.0.4,<5.1.0)", "mypy (>=1.5.1,<1.6.0)", "pre-commit (>=2.20.0,<2.21.0)", "pytest (>=7.2.0,<7.3.0)"]
+google = ["google-cloud-aiplatform (>=1.36.4,<1.37.0)"]
 human-evaluation = ["scaleapi (>=2.13.0,<2.14.0)", "surge-api (>=1.1.0,<1.2.0)"]
 images = ["accelerate (>=0.23.0,<0.24.0)", "pillow (>=9.4.0,<9.5.0)"]
 metrics = ["numba (>=0.56.4,<0.57.0)", "pytrec-eval (==0.5)", "sacrebleu (>=2.2.1,<2.3.0)", "summ-eval (>=0.892,<1.0)"]
-models = ["crfm-helm[aleph-alpha]", "crfm-helm[anthropic]", "crfm-helm[openai]", "crfm-helm[tsinghua]", "crfm-helm[yandex]"]
+models = ["crfm-helm[aleph-alpha]", "crfm-helm[anthropic]", "crfm-helm[google]", "crfm-helm[openai]", "crfm-helm[tsinghua]", "crfm-helm[yandex]"]
+mongo = ["pymongo (>=4.2.0,<4.3.0)"]
 openai = ["openai (>=0.27.8,<0.28.0)", "tiktoken (>=0.3.3,<0.4.0)"]
 plots = ["colorcet (>=3.0.1,<3.1.0)", "matplotlib (>=3.6.0,<3.7.0)", "seaborn (>=0.11.0,<0.12.0)"]
 proxy-server = ["gunicorn (>=20.1.0,<20.2.0)"]
@@ -1058,7 +1070,6 @@ files = [
     {file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:227b178b22a7f91ae88525810441791b1ca1fc71c86f03190911793be15cec3d"},
     {file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:780eb6383fbae12afa819ef676fc93e1548ae4b076c004a393af26a04b460742"},
     {file = "jq-1.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:08ded6467f4ef89fec35b2bf310f210f8cd13fbd9d80e521500889edf8d22441"},
-    {file = "jq-1.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49e44ed677713f4115bd5bf2dbae23baa4cd503be350e12a1c1f506b0687848f"},
     {file = "jq-1.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:984f33862af285ad3e41e23179ac4795f1701822473e1a26bf87ff023e5a89ea"},
     {file = "jq-1.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f42264fafc6166efb5611b5d4cb01058887d050a6c19334f6a3f8a13bb369df5"},
     {file = "jq-1.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a67154f150aaf76cc1294032ed588436eb002097dd4fd1e283824bf753a05080"},
@@ -2102,91 +2113,6 @@ pyparsing = {version = ">=2,<4", markers = "python_version >= \"3.0\""}
 [package.extras]
 duration = ["python-dateutil (>=2.8.0)"]
 
-[[package]]
-name = "pymongo"
-version = "4.2.0"
-description = "Python driver for MongoDB <http://www.mongodb.org>"
-optional = false
-python-versions = ">=3.7"
-files = [
-    {file = "pymongo-4.2.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:b9e4981a65f8500a3a46bb3a1e81b9feb45cf0b2115ad9c4f8d517326d026940"},
-    {file = "pymongo-4.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1c81414b706627f15e921e29ae2403aab52e33e36ed92ed989c602888d7c3b90"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux1_i686.whl", hash = "sha256:c549bb519456ee230e92f415c5b4d962094caac0fdbcc4ed22b576f66169764e"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:70216ec4c248213ae95ea499b6314c385ce01a5946c448fb22f6c8395806e740"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_i686.whl", hash = "sha256:8a86e8c2ac2ec87141e1c6cb00bdb18a4560f06e5f96769abcd1dda24dc0e764"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:314b556afd72eb21a6a10bd1f45ef252509f014f80207db59c97372103c88237"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:902e2c9030cb042c49750bc70d72d830d42c64ea0df5ff8630c171e065c93dd7"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:c69ef5906dcd6ec565d4d887ba97ceb2a84f3b614307ee3b4780cb1ea40b1867"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07564178ecc203a84f63e72972691af6c0c82d2dc0c9da66ba711695276089ba"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47d5f10922cf7f7dfcd1406bd0926cef6d866a75953c3745502dffd7ac197dd"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cadaaa5c19ad23fc84559e90284f2eb003c36958ebb2c06f286b678f441285f"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d94f535df9f539615bc3dbbef185ded3b609373bb44ca1afffcabac70202678a"},
-    {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:147a23cd96feb67606ac957744d8d25b013426cdc3c7164a4f99bd8253f649e3"},
-    {file = "pymongo-4.2.0-cp310-cp310-win32.whl", hash = "sha256:ecdcb0d4e9b08b739035f57a09330efc6f464bd7f942b63897395d996ca6ebd5"},
-    {file = "pymongo-4.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:8c223aea52c359cc8fdee5bd3475532590755c269ec4d4fe581acd47a44e9952"},
-    {file = "pymongo-4.2.0-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:fe0820d169635e41c14a5d21514282e0b93347878666ec9d5d3bf0eed0649948"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e39cacee70a98758f9b2da53ee175378f07c60113b1fa4fae40cbaee5583181e"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:701d331060dae72bf3ebdb82924405d14136a69282ccb00c89fc69dee21340b4"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:e08fe1731f5429435b8dea1db9663f9ed1812915ff803fc9991c7c4841ed62ad"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:60c470a58c5b62b1b12a5f5458f8e2f2f67b94e198d03dc5352f854d9230c394"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:b211e161b6cc2790e0d640ad38e0429d06c944e5da23410f4dc61809dba25095"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:ed90a9de4431cbfb2f3b2ef0c5fd356e61c85117b2be4db3eae28cb409f6e2d5"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:68e1e49a5675748233f7b05330f092582cd52f2850b4244939fd75ba640593ed"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:764fc15418d94bce5c2f8ebdbf66544f96f42efb1364b61e715e5b33281b388d"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e64442aba81ed4df1ca494b87bf818569a1280acaa73071c68014f7a884e83f1"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83168126ae2457d1a19b2af665cafa7ef78c2dcff192d7d7b5dad6b36c73ae24"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69d0180bca594e81cdb4a2af328bdb4046f59e10aaeef7619496fe64f2ec918c"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80cbf0b043061451660099fff9001a7faacb2c9c983842b4819526e2f944dc6c"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e1b8f5e2f9637492b0da4d51f78ecb17786e61d6c461ead8542c944750faf4f9"},
-    {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1a957cdc2b26eeed4d8f1889a40c6023dd1bd94672dd0f5ce327314f2caaefd4"},
-    {file = "pymongo-4.2.0-cp37-cp37m-win32.whl", hash = "sha256:6bd5888997ea3eae9830c6cc7964b61dcfbc50eb3a5a6ce56ad5f86d5579b11c"},
-    {file = "pymongo-4.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:dc24737d24ce0de762bee9c2a884639819485f679bbac8ab5be9c161ef6f9b2c"},
-    {file = "pymongo-4.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:193cc97d44b1e6d2253ea94e30c6f94f994efb7166e2452af4df55825266e88b"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e152c26ffc30331e9d57591fc4c05453c209aa20ba299d1deb7173f7d1958c22"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8a9bc4dcfc2bda69ee88cdb7a89b03f2b8eca668519b704384a264dea2db4209"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8cbb868e88c4eee1c53364bb343d226a3c0e959e791e6828030cb78f46cfcbe3"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:2bfe6b59f431f40fa545547616f4acf0c0c4b64518b1f951083e3bad06eb368b"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:ff66014687598823b6b23751884b4aa67eb934445406d95894dfc60cb7bfcc18"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:31c50da4a080166bc29403aa91f4c76e0889b4f24928d1b60508a37c1bf87f9a"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:ccfdc7722df445c49dc6b5d514c3544cad99b53189165f7546793933050ac7fb"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc7ebc37b03956a070260665079665eae69e5e96007694214f3a2107af96816a"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8b4a782aac43948308087b962c9ecb030ba98886ce6dee3ad7aafe8c5e1ce80"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1c23527f8e13f526fededbb96f2e7888f179fe27c51d41c2724f7059b75b2fa"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83cc3c35aeeceb67143914db67f685206e1aa37ea837d872f4bc28d7f80917c9"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e09cdf5aad507c8faa30d97884cc42932ed3a9c2b7f22cc3ccc607bae03981b3"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0f53253f4777cbccc426e669a2af875f26c95bd090d88593287b9a0a8ac7fa25"},
-    {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21238b19243a42f9a34a6d39e7580ceebc6da6d2f3cf729c1cff9023cb61a5f1"},
-    {file = "pymongo-4.2.0-cp38-cp38-win32.whl", hash = "sha256:766acb5b1a19eae0f7467bcd3398748f110ea5309cdfc59faa5185dcc7fd4dca"},
-    {file = "pymongo-4.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:10f09c4f09757c2e2a707ad7304f5d69cb8fdf7cbfb644dbacfe5bbe8afe311b"},
-    {file = "pymongo-4.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a6bf01b9237f794fa3bdad5089474067d28be7e199b356a18d3f247a45775f26"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d8bb745321716e7a11220a67c88212ecedde4021e1de4802e563baef9df921d2"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3be53e9888e759c49ae35d747ff77a04ff82b894dd64601e0f3a5a159b406245"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a3efdf154844244e0dabe902cf1827fdced55fa5b144adec2a86e5ce50a99b97"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:a7eb5b06744b911b6668b427c8abc71b6d624e72d3dfffed00988fa1b4340f97"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:b0be613d926c5dbb0d3fc6b58e4f2be4979f80ae76fda6e47309f011b388fe0c"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:e7dcb73f683c155885a3488646fcead3a895765fed16e93c9b80000bc69e96cb"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:b537dd282de1b53d9ae7cf9f3df36420c8618390f2da92100391f3ba8f3c141a"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d98d2a8283c9928a9e5adf2f3c0181e095579e9732e1613aaa55d386e2bcb6c5"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76892bbce743eb9f90360b3626ea92f13d338010a1004b4488e79e555b339921"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:124d0e880b66f9b0778613198e89984984fdd37a3030a9007e5f459a42dfa2d3"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:773467d25c293f8e981b092361dab5fd800e1ba318403b7959d35004c67faedc"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6673ab3fbf3135cc1a8c0f70d480db5b2378c3a70af8d602f73f76b8338bdf97"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:153b8f8705970756226dfeeb7bb9637e0ad54a4d79b480b4c8244e34e16e1662"},
-    {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:01721da74558f2f64a9f162ee063df403ed656b7d84229268d8e4ae99cfba59c"},
-    {file = "pymongo-4.2.0-cp39-cp39-win32.whl", hash = "sha256:a25c0eb2d610b20e276e684be61c337396813b636b69373c17314283cb1a3b14"},
-    {file = "pymongo-4.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:44b36ccb90aac5ea50be23c1a6e8f24fbfc78afabdef114af16c6e0a80981364"},
-    {file = "pymongo-4.2.0.tar.gz", hash = "sha256:72f338f6aabd37d343bd9d1fdd3de921104d395766bcc5cdc4039e4c2dd97766"},
-]
-
-[package.extras]
-aws = ["pymongo-auth-aws (<2.0.0)"]
-encryption = ["pymongocrypt (>=1.3.0,<2.0.0)"]
-gssapi = ["pykerberos"]
-ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"]
-snappy = ["python-snappy"]
-srv = ["dnspython (>=1.16.0,<3.0.0)"]
-zstd = ["zstandard"]
-
 [[package]]
 name = "pyparsing"
 version = "3.1.1"
@@ -2274,6 +2200,7 @@ files = [
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
     {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
+    {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
     {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
     {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
     {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@@ -2281,8 +2208,15 @@ files = [
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
     {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
+    {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
     {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
     {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
+    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
+    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
+    {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
     {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
     {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
     {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@@ -2299,6 +2233,7 @@ files = [
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
     {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
+    {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
     {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
     {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
     {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@@ -2306,6 +2241,7 @@ files = [
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
     {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
+    {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
     {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
     {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
     {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@@ -3653,4 +3589,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.11"
-content-hash = "a936c5de01f305bdec3631d36eebc375418549fca1be0ad6c7ac2fa810ca151a"
+content-hash = "5cba43411b92ed164c4bbc30fbab94b73a41acdeb14fc104e4915a48e04f189b"
diff --git a/pyproject.toml b/pyproject.toml
index b240e61f..a2862d71 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,9 +12,10 @@ packages = [
 [tool.poetry.dependencies]
 python = ">=3.10,<3.11"
 pyext = {url = "https://files.pythonhosted.org/packages/b0/be/9b6005ac644aaef022527ce49617263379e49dbdbd433d1d3dd66d71f570/pyext-0.7.tar.gz"}
-crfm-helm = "0.3"
+crfm-helm = "^0.4.0"
 jq = "^1.6.0"
 click = "^8.1.7"
+casefy = "^0.1.7"
 
 [tool.poetry.group.dev.dependencies]
 pytest-datafiles = "^3.0.0"
diff --git a/src/coffee/__init__.py b/src/coffee/__init__.py
index e69de29b..4e0f618a 100644
--- a/src/coffee/__init__.py
+++ b/src/coffee/__init__.py
@@ -0,0 +1,6 @@
+class CoffeeConfig:  # settings object; this module keeps one instance as app_config
+    def __init__(self) -> None:
+        self.debug = False  # off by default; CliHelmRunner._execute checks this flag
+
+
+app_config = CoffeeConfig()
diff --git a/src/coffee/benchmark.py b/src/coffee/benchmark.py
index bc8db834..0a68b60c 100644
--- a/src/coffee/benchmark.py
+++ b/src/coffee/benchmark.py
@@ -1,4 +1,9 @@
 from abc import ABC, abstractmethod
+from typing import List
+
+import casefy
+
+from coffee.helm import BbqHelmTest, HelmTest
 
 
 class Benchmark(ABC):
@@ -11,6 +16,18 @@ def __init__(self, sut, scores):
     def overall_score(self) -> float:
         pass
 
+    @abstractmethod
+    def tests(self) -> List[HelmTest]:  # subclasses return their HelmTest objects
+        pass
+
+    @classmethod
+    def name(cls) -> str:  # the class name passed through casefy.titlecase
+        return casefy.titlecase(cls.__name__)
+
+    @classmethod
+    def path_name(cls) -> str:  # the class name passed through casefy.snakecase
+        return casefy.snakecase(cls.__name__)
+
 
 class RidiculousBenchmark(Benchmark):
     def overall_score(self) -> float:
@@ -21,3 +38,7 @@ def overall_score(self) -> float:
             count += 1
             total += bbq[subject]["bbq_accuracy"]
         return total / count * 5
+
+    @classmethod
+    def tests(cls) -> List[HelmTest]:  # a single test: BBQ
+        return [BbqHelmTest()]
diff --git a/src/coffee/helm.py b/src/coffee/helm.py
new file mode 100644
index 00000000..c7c3c60e
--- /dev/null
+++ b/src/coffee/helm.py
@@ -0,0 +1,225 @@
+import dataclasses
+import json
+import logging
+import pathlib
+import re
+import subprocess
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from enum import Enum
+from typing import List
+
+import jq
+
+import coffee
+
+# This starts with a bunch of objects that represent things already in HELM code.
+# As we shift HELM to accommodate a library use case, it would be nice to compose
+# a run directly out of objects/enums/constants, or at least compose RunSpecs from
+# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".
+
+
+@dataclasses.dataclass
+class SutDescription:
+    key: str  # model identifier; CliHelmRunner appends it to runspecs as "model=<key>"
+    huggingface: bool = dataclasses.field(repr=False, default=False)
+
+    def __hash__(self) -> int:  # defined explicitly, so @dataclass leaves it in place
+        return super().__hash__()  # defers to the next class in the MRO
+
+
+class HelmSut(SutDescription, Enum):  # values become SutDescription init args
+    GPT2 = "openai/gpt2"
+    PYTHIA_70M = "EleutherAI/pythia-70m", True  # True sets the huggingface field
+    FB_OPT_125M = "facebook/opt-125m", True
+
+
+class HelmTest(ABC):
+    # I would like this to be another enum, but BBQ's structural chaos means
+    # for now we need custom behavior
+    def __init__(self, prefix: str):  # runspec prefix, e.g. "bbq"
+        super().__init__()
+        self.prefix = prefix
+
+    @abstractmethod
+    def runspecs(self) -> List[str]:  # subclasses return the runspec strings to run
+        pass
+
+    def __str__(self) -> str:  # "<ClassName>:<prefix>"
+        return self.__class__.__name__ + ":" + self.prefix
+
+
+class BbqHelmTest(HelmTest):
+    def __init__(self) -> None:
+        super().__init__("bbq")  # every runspec built below starts with this prefix
+
+    # Copied from HELM because it's not exposed outside the method
+    CATEGORIES = [
+        "Age",
+        "Disability_status",
+        "Gender_identity",
+        "Nationality",
+        "Physical_appearance",
+        "Race_ethnicity",
+        "Race_x_SES",
+        "Race_x_gender",
+        "Religion",
+        "SES",
+        # "Sexual_orientation", TODO: temporarily disabled while Yifan looks into a transformer-related bug
+    ]
+
+    def runspecs(self) -> List[str]:  # one "bbq:subject=<category>" per CATEGORIES entry
+        return [f"{self.prefix}:subject={c}" for c in BbqHelmTest.CATEGORIES]
+
+
+class HelmScores:
+    # hacky container: the right shape isn't known yet, so scores live in loose dicts
+    def __init__(self):
+        super().__init__()
+        self.data = defaultdict(list)
+
+    def add(self, test, sut, test_sut_scores):
+        self.data[test.__class__.__name__, sut].append(test_sut_scores)
+
+    def for_sut(self, desired_sut) -> dict:
+        # merge every score dict recorded for this SUT into one dict per test class name
+        merged: defaultdict = defaultdict(dict)
+        for (test_name, sut), entries in self.data.items():
+            if sut == desired_sut:
+                for entry in entries:
+                    merged[test_name].update(entry)
+        return merged
+
+
+class HelmResult:
+    # what one helm-run invocation produced: the process result and its output dir
+    def __init__(
+        self,
+        tests: List[HelmTest],
+        suts: List[HelmSut],
+        output_dir: pathlib.Path,
+        execution_result: subprocess.CompletedProcess,
+    ):
+        super().__init__()
+        self.tests = tests
+        self.suts = suts
+        self.output_dir = output_dir
+        self.execution_result = execution_result
+
+    def load_scores(self):
+        """Collect bbq_* stats from output_dir for every (test, SUT) pair."""
+        focus = self.output_dir / "benchmark_output" / "runs" / "v1"
+        subject_query = jq.compile(".scenario_spec.args.subject")  # built once, reused
+        result = HelmScores()
+        for t in self.tests:
+            for s in self.suts:
+                # long term we'll need a lot more data; this is just enough to compute simple scores
+                test_sut_scores = {}
+                glob_path = f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.key)}*"
+                logging.debug(f"looking for scores for {t} {s} in {focus}/{glob_path}")
+                for d in focus.glob(glob_path):
+                    with open(d / "run_spec.json", encoding="utf-8") as f:
+                        subject = subject_query.input_value(json.load(f)).first()
+                    with open(d / "stats.json", encoding="utf-8") as f:
+                        stats = json.load(f)
+                    test_sut_scores[subject] = {
+                        stat["name"]["name"]: stat["sum"]
+                        for stat in stats
+                        if stat["name"]["name"].startswith("bbq_")
+                    }
+                result.add(t, s, test_sut_scores)
+        return result
+
+    def helm_stdout(self) -> str:
+        return self._deal_with_bytes(self.execution_result.stdout)
+
+    def helm_stderr(self) -> str:
+        return self._deal_with_bytes(self.execution_result.stderr)
+
+    def _deal_with_bytes(self, o):
+        # bytes are decoded as UTF-8; None (stream not captured) becomes ""
+        if o is None:
+            return ""
+        return o.decode("utf-8") if isinstance(o, bytes) else str(o)
+
+    def _filesystem_safe(self, s: str):
+        # reproducing some behavior in HELM; would be nice to remove duplication
+        return s.replace("/", "_")
+
+    def success(self) -> bool:
+        """Return True only when helm was executed and exited with status 0."""
+        return bool(self.execution_result) and self.execution_result.returncode == 0
+
+
+class HelmRunner(ABC):  # interface: run a list of HELM tests against a list of SUTs
+    @abstractmethod
+    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
+        pass  # NOTE(review): CliHelmRunner.run names the 2nd parameter "suts"
+
+
+class CliHelmRunner(HelmRunner):
+    # runs HELM by shelling out to the helm-run command line tool
+    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
+        """Run all tests against all SUTs in one helm-run call."""
+        runspecs = []
+        for s in suts:
+            for t in tests:
+                runspecs.extend(r + ",model=" + s.key for r in t.runspecs())
+        hf_models = [s.key for s in suts if s.huggingface]
+
+        command = self._helm_command_for_runspecs(runspecs, max_instances, hf_models)
+        logging.debug(f"helm run command: {command}")
+
+        output_dir = self._make_output_dir()
+        execute_result = self._execute(command, output_dir)
+        return HelmResult(tests, suts, output_dir, execute_result)
+
+    def _execute(
+        self, command: List[str], output_dir: pathlib.Path
+    ) -> subprocess.CompletedProcess:
+        """Run command inside output_dir; in debug mode its output is also logged."""
+        if coffee.app_config.debug:
+            return self._run_with_debug_settings(command, output_dir)
+        # NOTE: the argv is joined and passed to a shell, so arguments are not quoted
+        cmd = " ".join(command)
+        return subprocess.run(cmd, shell=True, capture_output=True, cwd=output_dir)
+
+    def _run_with_debug_settings(self, command, output_dir):
+        """Run the command, logging output line by line while keeping a copy."""
+        out = []  # the pipe is closed after the with-block; keep the bytes read
+        with subprocess.Popen(
+            " ".join(command),
+            shell=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            cwd=output_dir,
+        ) as sp:
+            for line in sp.stdout:
+                out.append(line)
+                logging.debug(line.decode(errors="replace").rstrip())
+        return subprocess.CompletedProcess(sp.args, sp.returncode, b"".join(out), None)
+
+    def _make_output_dir(self):
+        """Return the run/ directory used as helm's cwd, creating it if missing."""
+        o = pathlib.Path.cwd()
+        if o.name in ["src", "test"]:
+            o = o.parent
+        if o.name != "run":
+            o = o / "run"
+        o.mkdir(exist_ok=True)
+        return o
+
+    def _helm_command_for_runspecs(
+        self, bbq_runspecs, max_instances, huggingface_models=None
+    ):
+        """Build the helm-run argument list for the given runspecs."""
+        # "v1" is a fixed suite name for now, which is probably wrong
+        command = ["helm-run", "--suite", "v1"]
+        command.extend(["-n", "1"])  # working around a bug
+        if huggingface_models:
+            command.append("--enable-huggingface-models")
+            command.extend(huggingface_models)
+        command.extend(["--max-eval-instances", str(max_instances)])
+        command.append("-r")
+        command.extend(bbq_runspecs)
+        return command
diff --git a/src/coffee/run.py b/src/coffee/run.py
index 9d934e54..bc4d32c4 100644
--- a/src/coffee/run.py
+++ b/src/coffee/run.py
@@ -1,174 +1,14 @@
-import json
+import logging
 import pathlib
-import re
-import subprocess
-from abc import abstractmethod, ABC
-from collections import defaultdict
-from enum import Enum
-from typing import List
 
 import click
-import jq
 
+import coffee
 from coffee.benchmark import Benchmark, RidiculousBenchmark
+from coffee.helm import HelmSut, BbqHelmTest, CliHelmRunner
 from coffee.static_site_generator import StaticSiteGenerator
 
 
-# This starts with a bunch of objects that represent things already in HELM code.
-# As we shift HELM to accommodate a library use case, it would be nice to compose
-# a run directly out of objects/enums/constants, or at least compose RunSpecs from
-# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq".
-
-
-class HelmSut(Enum):
-    GPT2 = "huggingface/gpt2"
-
-
-class HelmTest(ABC):
-    # I would like this to be another enum, but BBQ's structural chaos means
-    # for now we need custom behavior
-    def __init__(self, prefix):
-        super().__init__()
-        self.prefix = prefix
-
-    @abstractmethod
-    def runspecs(self) -> List[str]:
-        pass
-
-
-class BbqHelmTest(HelmTest):
-    def __init__(self):
-        super().__init__("bbq")
-
-    # Copied from HELM because it's not exposed outside the method
-    CATEGORIES = [
-        "Age",
-        "Disability_status",
-        "Gender_identity",
-        "Nationality",
-        "Physical_appearance",
-        "Race_ethnicity",
-        "Race_x_SES",
-        "Race_x_gender",
-        "Religion",
-        "SES",
-        "Sexual_orientation",
-    ]
-
-    def runspecs(self) -> List[str]:
-        return [f"{self.prefix}:subject={c}" for c in BbqHelmTest.CATEGORIES]
-
-
-class HelmScores:
-    # a kinda hacky container; we won't know the right shape of this for a while, so just use wild dicts
-    def __init__(self):
-        super().__init__()
-        self.data = defaultdict(list)
-
-    def add(self, test, sut, test_sut_scores):
-        self.data[(test.__class__.__name__, sut)].append(test_sut_scores)
-
-    def for_sut(self, desired_sut) -> dict:
-        result: defaultdict = defaultdict(dict)
-        for test, sut in self.data:
-            if sut == desired_sut:
-                for entry in self.data[(test, sut)]:
-                    result[test].update(entry)
-
-        return result
-
-
-class HelmResult:
-    def __init__(
-        self,
-        tests: List[HelmTest],
-        suts: List[HelmSut],
-        output_dir: pathlib.Path,
-        execution_result: subprocess.CompletedProcess,
-    ):
-        super().__init__()
-        self.tests = tests
-        self.suts = suts
-        self.output_dir = output_dir
-        # TODO: make sure the execution succeeded
-
-    def load_scores(self):
-        focus = self.output_dir / "benchmark_output" / "runs" / "v1"
-        result = HelmScores()
-        for t in self.tests:
-            for s in self.suts:
-                # long term we'll need a lot more data; this is just enough to compute simple scores
-                test_sut_scores = {}
-                for d in focus.glob(
-                    f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*"
-                ):
-                    subject_result = {}
-                    with open(d / "run_spec.json") as f:
-                        j = json.load(f)
-                    subject = (
-                        jq.compile(".scenario_spec.args.subject").input_value(j).first()
-                    )
-                    with open(d / "stats.json") as f:
-                        j = json.load(f)
-                    for stat in j:
-                        if stat["name"]["name"].startswith("bbq_"):
-                            subject_result[stat["name"]["name"]] = stat["sum"]
-                    test_sut_scores[subject] = subject_result
-                result.add(t, s, test_sut_scores)
-        return result
-
-    def _filesystem_safe(self, s: str):
-        # reproducing some behavior in HELM; would be nice to remove duplication
-        return re.sub("/", "_", s)
-
-
-class HelmRunner(ABC):
-    @abstractmethod
-    def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10):
-        pass
-
-
-class CliHelmRunner(HelmRunner):
-    def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10):
-        runspecs = []
-        for s in suts:
-            for t in tests:
-                for r in t.runspecs():
-                    runspecs.append(r + ",model=" + s.value)
-
-        command = self._helm_command_for_runspecs(runspecs, max_instances)
-
-        output_dir = self._make_output_dir()
-        execute_result = self._execute(command, output_dir)
-        return HelmResult(tests, suts, output_dir, execute_result)
-
-    def _execute(self, command, output_dir):
-        return subprocess.run(
-            " ".join(command), shell=True, capture_output=True, cwd=output_dir
-        )
-
-    def _make_output_dir(self):
-        o = pathlib.Path.cwd()
-        if o.name in ["src", "test"]:
-            o = o.parent
-        if not o.name == "run":
-            o = o / "run"
-        o.mkdir(exist_ok=True)
-        return o
-
-    def _helm_command_for_runspecs(self, bbq_runspecs, max_instances):
-        command = ["helm-run"]
-        command.extend(
-            ["--suite", "v1"]
-        )  # this is fixed for now, which is probably wrong
-        command.extend(["-n", "1"])  # working around a bug
-        command.extend(["--max-eval-instances", str(max_instances)])
-
-        command.append("-r")
-        command.extend(bbq_runspecs)
-        return command
-
-
 def quantize_stars(raw_score):
     return round(2 * raw_score) / 2.0
 
@@ -181,10 +21,26 @@ def quantize_stars(raw_score):
     type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path),
 )
 @click.option("--max-instances", "-m", type=int, default=100)
-def cli(output_dir: pathlib.Path, max_instances: int):
+@click.option("--debug", default=False, is_flag=True)
+def cli(output_dir: pathlib.Path, max_instances: int, debug: bool) -> None:
+    coffee.app_config.debug = debug
+
+    if coffee.app_config.debug:
+        logging.basicConfig(level=logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+
     runner = CliHelmRunner()
-    suts = [HelmSut.GPT2]
-    result = runner.run([BbqHelmTest()], suts, max_instances=max_instances)
+    suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M]
+    result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=max_instances)
+    if not result.success():
+        print(
+            f"HELM execution failed with return code {result.execution_result.returncode}:"
+        )
+        print("stdout:")
+        print(result.helm_stdout())
+        print("stderr:")
+        print(result.helm_stderr())
     scores = result.load_scores()
     benchmarks: list[Benchmark] = []
     for sut in suts:
diff --git a/src/coffee/static_site_generator.py b/src/coffee/static_site_generator.py
index e5e40dda..5be03194 100644
--- a/src/coffee/static_site_generator.py
+++ b/src/coffee/static_site_generator.py
@@ -4,6 +4,7 @@
 from itertools import groupby
 from typing import Tuple
 
+import casefy
 from jinja2 import Environment, PackageLoader, select_autoescape
 
 from coffee.benchmark import Benchmark
@@ -60,11 +61,7 @@ def _static_dir(self):
         return self._template_dir() / "static"
 
     def _copy_static_dir(self, output_dir):
-        shutil.copytree(
-            self._static_dir(),
-            output_dir / "static",
-            dirs_exist_ok=True
-        )
+        shutil.copytree(self._static_dir(), output_dir / "static", dirs_exist_ok=True)
 
     def generate(self, benchmarks: list[Benchmark], output_dir: pathlib.Path) -> None:
         self._copy_static_dir(output_dir)
@@ -72,23 +69,29 @@ def generate(self, benchmarks: list[Benchmark], output_dir: pathlib.Path) -> Non
         benchmark_template = self.env.get_template("benchmark.html")
         index_template = self.env.get_template("index.html")
 
-        for benchmark_name, grouped_benchmarks in groupby(benchmarks, lambda x: x.__class__.__name__):
+        for benchmark_class, grouped_benchmarks in groupby(
+            benchmarks, lambda x: x.__class__
+        ):
             suts = {}
             for benchmark in grouped_benchmarks:
                 this_sut = suts[benchmark.sut.name] = {}
-                this_sut["stars"], this_sut["half_star"], this_sut["empty_stars"] = self.calculate_stars(benchmark)
+                (
+                    this_sut["stars"],
+                    this_sut["half_star"],
+                    this_sut["empty_stars"],
+                ) = self.calculate_stars(benchmark)
                 this_sut["name"] = benchmark.sut.name
 
             with open(
-                    pathlib.Path(
-                        output_dir, f"{benchmark_name.lower()}.html"
-                    ),
-                    "w+",
+                pathlib.Path(
+                    output_dir, f"{casefy.snakecase(benchmark_class.path_name())}.html"
+                ),
+                "w+",
             ) as f:
                 f.write(
                     benchmark_template.render(
                         suts=suts,
-                        benchmark_name=benchmark_name,
+                        benchmark_name=benchmark_class.name(),
                         benchmarks=benchmarks,
                         stars_description=STARS_DESCRIPTION,
                     )
@@ -99,4 +102,4 @@ def generate(self, benchmarks: list[Benchmark], output_dir: pathlib.Path) -> Non
                 index_template.render(
                     benchmarks=benchmarks, stars_description=STARS_DESCRIPTION
                 )
-            )
\ No newline at end of file
+            )
diff --git a/src/coffee/templates/base.html b/src/coffee/templates/base.html
index 4b6b371c..cbf8f804 100644
--- a/src/coffee/templates/base.html
+++ b/src/coffee/templates/base.html
@@ -23,8 +23,8 @@
                         Benchmarks
                     </a>
                     <ul class="dropdown-menu">
-                        {% for benchmark in benchmarks|unique(attribute="__class__.__name__") %}
-                            <li><a class="dropdown-item" href="{{ benchmark.__class__.__name__ | lower }}.html">{{ benchmark.__class__.__name__ }}</a></li>
+                        {% for benchmark in benchmarks|unique(attribute="__class__") %}
+                            <li><a class="dropdown-item" href="{{ benchmark.__class__.path_name() }}.html">{{ benchmark.__class__.name() }}</a></li>
                         {% endfor %}
                     </ul>
                 </li>
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/stats.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/stats.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/stats.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/run_spec.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json
diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/stats.json
similarity index 100%
rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/stats.json
rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/stats.json
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
new file mode 100644
index 00000000..2c0d3c83
--- /dev/null
+++ b/tests/test_benchmark.py
@@ -0,0 +1,16 @@
+from coffee.helm import HelmSut
+from coffee.benchmark import RidiculousBenchmark
+
+
+def test_class_basics():  # name() and path_name() are called on the class itself
+    assert RidiculousBenchmark.name() == "Ridiculous Benchmark"
+    assert RidiculousBenchmark.path_name() == "ridiculous_benchmark"
+
+
+def test_instance_basics():  # the same accessors must also work through an instance
+    rb = RidiculousBenchmark(HelmSut.GPT2, {})
+    assert rb.name() == "Ridiculous Benchmark"
+    assert rb.path_name() == "ridiculous_benchmark"
+
+
+# not testing benchmark scoring for the moment because it's all throwaway
diff --git a/tests/test_helm_runner.py b/tests/test_helm_runner.py
index 1c48bfb1..46a29c42 100644
--- a/tests/test_helm_runner.py
+++ b/tests/test_helm_runner.py
@@ -5,13 +5,8 @@
 
 import pytest
 
-from coffee.run import (
-    CliHelmRunner,
-    BbqHelmTest,
-    HelmSut,
-    HelmResult,
-    quantize_stars,
-)
+from coffee.run import quantize_stars
+from coffee.helm import HelmSut, BbqHelmTest, HelmResult, CliHelmRunner
 from coffee.benchmark import RidiculousBenchmark
 
 
@@ -22,10 +17,24 @@ def test_cli_helm_runner_command(cwd_tmpdir):
     shell_arguments = runner._execute.call_args.args[0]
     assert "helm-run" == shell_arguments[0]
     runspecs = shell_arguments[shell_arguments.index("-r") + 1 :]
-    assert "bbq:subject=Age,model=huggingface/gpt2" == runspecs[0]
+    assert "bbq:subject=Age,model=openai/gpt2" == runspecs[0]
     assert len(BbqHelmTest.CATEGORIES) == len(runspecs)
 
 
+def test_cli_helm_runner_command_handles_huggingface_models(cwd_tmpdir):
+    runner = CliHelmRunner()
+    runner._execute = Mock()
+    # try one normal model and two magic huggingface models
+    runner.run([BbqHelmTest()], [HelmSut.GPT2, HelmSut.FB_OPT_125M, HelmSut.PYTHIA_70M])
+    shell_arguments = runner._execute.call_args.args[0]
+    enables = [
+        i for (i, s) in enumerate(shell_arguments) if s == "--enable-huggingface-models"
+    ]
+    assert len(enables) == 1  # the flag appears once, not once per model
+    assert shell_arguments[enables[0] + 1] == HelmSut.FB_OPT_125M.key
+    assert shell_arguments[enables[0] + 2] == HelmSut.PYTHIA_70M.key
+
+
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_read_scores(datafiles):
     hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None)
@@ -56,3 +65,14 @@ def test_quantize_stars():
     assert 4.5 == quantize_stars(4.6)
     assert 4.5 == quantize_stars(4.7)
     assert 4.5 == quantize_stars(4.74)
+
+
+def test_helmsut_basics():
+    assert HelmSut.GPT2.key == "openai/gpt2"
+    assert hash(HelmSut.GPT2) is not None  # hash() never returns None, so this only fails if hashing raises
+
+
+def test_helmsut_huggingface():  # GPT2 is not flagged as a huggingface model; the other two are
+    assert HelmSut.GPT2.huggingface == False
+    assert HelmSut.FB_OPT_125M.huggingface == True
+    assert HelmSut.PYTHIA_70M.huggingface == True
diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py
index 11dcb618..e6d8621d 100644
--- a/tests/test_static_site_generator.py
+++ b/tests/test_static_site_generator.py
@@ -4,11 +4,7 @@
 
 import pytest
 
-from coffee.run import (
-    BbqHelmTest,
-    HelmSut,
-    HelmResult,
-)
+from coffee.helm import HelmSut, BbqHelmTest, HelmResult
 from coffee.benchmark import RidiculousBenchmark
 from coffee.static_site_generator import StaticSiteGenerator
 
@@ -21,7 +17,9 @@ def benchmark(datafiles):
     return b
 
 
-@pytest.mark.parametrize("path", ["ridiculousbenchmark.html", "static/images/ml_commons_logo.png"])
+@pytest.mark.parametrize(
+    "path", ["ridiculous_benchmark.html", "static/images/ml_commons_logo.png"]
+)
 @pytest.mark.datafiles(SIMPLE_BBQ_DATA)
 def test_creates_files(benchmark, tmp_path, path):
     generator = StaticSiteGenerator()