diff --git a/.gitignore b/.gitignore index e158bf16..82ec436f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ .idea/ -run/ \ No newline at end of file +run/ +__pycache__/ +web/ \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 3ef228dd..57f7fed2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -260,6 +260,17 @@ files = [ {file = "cachetools-5.3.2.tar.gz", hash = "sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2"}, ] +[[package]] +name = "casefy" +version = "0.1.7" +description = "Utilities for string case conversion." +optional = false +python-versions = ">=3.6" +files = [ + {file = "casefy-0.1.7-py3-none-any.whl", hash = "sha256:ab05ff1c67f2a8e62d9f8986fa9a849416d61ac5413ec57d1f827b4f36589cf6"}, + {file = "casefy-0.1.7.tar.gz", hash = "sha256:6accce985a64b9edb2a610a29ac489d78fac80e52ff8f2d137e294f2f92b8027"}, +] + [[package]] name = "catalogue" version = "2.0.10" @@ -502,13 +513,13 @@ srsly = ">=2.4.0,<3.0.0" [[package]] name = "crfm-helm" -version = "0.3.0" +version = "0.4.0" description = "Benchmark for language models" optional = false python-versions = "<3.11,>=3.8" files = [ - {file = "crfm-helm-0.3.0.tar.gz", hash = "sha256:1a9b75e2e544e8c2fc497fa4bf32c581f6c93f833c969e17843eb0bcea1f98ca"}, - {file = "crfm_helm-0.3.0-py3-none-any.whl", hash = "sha256:6256ee699461bb7551476201fc3190b19160a758436fe4ccee563d3110d28498"}, + {file = "crfm-helm-0.4.0.tar.gz", hash = "sha256:06d49ad3c3c07eae67898e204c856b75e96a20c93e6cf8f20e56bce2c13cdaa3"}, + {file = "crfm_helm-0.4.0-py3-none-any.whl", hash = "sha256:3fc9c3721f78f48632cad6dfb04851de7055f83d900125fd8d247b9503a99a27"}, ] [package.dependencies] @@ -524,7 +535,6 @@ numpy = ">=1.23.3,<1.24.0" pyarrow = ">=11.0.0" pyext = ">=0.7,<1.0" pyhocon = ">=0.3.59,<0.4.0" -pymongo = ">=4.2.0,<4.3.0" retrying = ">=1.3.4,<1.4.0" rouge-score = ">=0.1.2,<0.2.0" scikit-learn = ">=1.1.2,<1.2.0" @@ -540,14 +550,16 @@ zstandard = ">=0.18.0,<0.19.0" [package.extras] aleph-alpha = 
["aleph-alpha-client (>=2.14.0,<2.15.0)", "tokenizers (>=0.13.3,<0.14.0)"] -all = ["crfm-helm[cleva]", "crfm-helm[human-evaluation]", "crfm-helm[images]", "crfm-helm[metrics]", "crfm-helm[models]", "crfm-helm[plots]", "crfm-helm[proxy-server]", "crfm-helm[scenarios]", "crfm-helm[slurm]"] +all = ["crfm-helm[cleva]", "crfm-helm[human-evaluation]", "crfm-helm[images]", "crfm-helm[metrics]", "crfm-helm[models]", "crfm-helm[mongo]", "crfm-helm[plots]", "crfm-helm[proxy-server]", "crfm-helm[scenarios]", "crfm-helm[slurm]"] anthropic = ["anthropic (>=0.2.5,<0.3.0)", "websocket-client (>=1.3.2,<1.4.0)"] cleva = ["jieba (==0.42.1)", "langdetect (==1.0.9)", "opencc (==1.1.6)", "pypinyin (==0.49.0)", "unidecode (==1.3.6)"] -dev = ["black (>=22.10.0,<22.11.0)", "flake8 (>=5.0.4,<5.1.0)", "mypy (>=0.982,<1.0)", "pre-commit (>=2.20.0,<2.21.0)", "pytest (>=7.2.0,<7.3.0)"] +dev = ["black (>=22.10.0,<22.11.0)", "flake8 (>=5.0.4,<5.1.0)", "mypy (>=1.5.1,<1.6.0)", "pre-commit (>=2.20.0,<2.21.0)", "pytest (>=7.2.0,<7.3.0)"] +google = ["google-cloud-aiplatform (>=1.36.4,<1.37.0)"] human-evaluation = ["scaleapi (>=2.13.0,<2.14.0)", "surge-api (>=1.1.0,<1.2.0)"] images = ["accelerate (>=0.23.0,<0.24.0)", "pillow (>=9.4.0,<9.5.0)"] metrics = ["numba (>=0.56.4,<0.57.0)", "pytrec-eval (==0.5)", "sacrebleu (>=2.2.1,<2.3.0)", "summ-eval (>=0.892,<1.0)"] -models = ["crfm-helm[aleph-alpha]", "crfm-helm[anthropic]", "crfm-helm[openai]", "crfm-helm[tsinghua]", "crfm-helm[yandex]"] +models = ["crfm-helm[aleph-alpha]", "crfm-helm[anthropic]", "crfm-helm[google]", "crfm-helm[openai]", "crfm-helm[tsinghua]", "crfm-helm[yandex]"] +mongo = ["pymongo (>=4.2.0,<4.3.0)"] openai = ["openai (>=0.27.8,<0.28.0)", "tiktoken (>=0.3.3,<0.4.0)"] plots = ["colorcet (>=3.0.1,<3.1.0)", "matplotlib (>=3.6.0,<3.7.0)", "seaborn (>=0.11.0,<0.12.0)"] proxy-server = ["gunicorn (>=20.1.0,<20.2.0)"] @@ -1058,7 +1070,6 @@ files = [ {file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = 
"sha256:227b178b22a7f91ae88525810441791b1ca1fc71c86f03190911793be15cec3d"}, {file = "jq-1.6.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:780eb6383fbae12afa819ef676fc93e1548ae4b076c004a393af26a04b460742"}, {file = "jq-1.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:08ded6467f4ef89fec35b2bf310f210f8cd13fbd9d80e521500889edf8d22441"}, - {file = "jq-1.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:49e44ed677713f4115bd5bf2dbae23baa4cd503be350e12a1c1f506b0687848f"}, {file = "jq-1.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:984f33862af285ad3e41e23179ac4795f1701822473e1a26bf87ff023e5a89ea"}, {file = "jq-1.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f42264fafc6166efb5611b5d4cb01058887d050a6c19334f6a3f8a13bb369df5"}, {file = "jq-1.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a67154f150aaf76cc1294032ed588436eb002097dd4fd1e283824bf753a05080"}, @@ -2102,91 +2113,6 @@ pyparsing = {version = ">=2,<4", markers = "python_version >= \"3.0\""} [package.extras] duration = ["python-dateutil (>=2.8.0)"] -[[package]] -name = "pymongo" -version = "4.2.0" -description = "Python driver for MongoDB " -optional = false -python-versions = ">=3.7" -files = [ - {file = "pymongo-4.2.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:b9e4981a65f8500a3a46bb3a1e81b9feb45cf0b2115ad9c4f8d517326d026940"}, - {file = "pymongo-4.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1c81414b706627f15e921e29ae2403aab52e33e36ed92ed989c602888d7c3b90"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux1_i686.whl", hash = "sha256:c549bb519456ee230e92f415c5b4d962094caac0fdbcc4ed22b576f66169764e"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:70216ec4c248213ae95ea499b6314c385ce01a5946c448fb22f6c8395806e740"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_i686.whl", hash = 
"sha256:8a86e8c2ac2ec87141e1c6cb00bdb18a4560f06e5f96769abcd1dda24dc0e764"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:314b556afd72eb21a6a10bd1f45ef252509f014f80207db59c97372103c88237"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:902e2c9030cb042c49750bc70d72d830d42c64ea0df5ff8630c171e065c93dd7"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:c69ef5906dcd6ec565d4d887ba97ceb2a84f3b614307ee3b4780cb1ea40b1867"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07564178ecc203a84f63e72972691af6c0c82d2dc0c9da66ba711695276089ba"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47d5f10922cf7f7dfcd1406bd0926cef6d866a75953c3745502dffd7ac197dd"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cadaaa5c19ad23fc84559e90284f2eb003c36958ebb2c06f286b678f441285f"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d94f535df9f539615bc3dbbef185ded3b609373bb44ca1afffcabac70202678a"}, - {file = "pymongo-4.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:147a23cd96feb67606ac957744d8d25b013426cdc3c7164a4f99bd8253f649e3"}, - {file = "pymongo-4.2.0-cp310-cp310-win32.whl", hash = "sha256:ecdcb0d4e9b08b739035f57a09330efc6f464bd7f942b63897395d996ca6ebd5"}, - {file = "pymongo-4.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:8c223aea52c359cc8fdee5bd3475532590755c269ec4d4fe581acd47a44e9952"}, - {file = "pymongo-4.2.0-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:fe0820d169635e41c14a5d21514282e0b93347878666ec9d5d3bf0eed0649948"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:e39cacee70a98758f9b2da53ee175378f07c60113b1fa4fae40cbaee5583181e"}, - {file = 
"pymongo-4.2.0-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:701d331060dae72bf3ebdb82924405d14136a69282ccb00c89fc69dee21340b4"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:e08fe1731f5429435b8dea1db9663f9ed1812915ff803fc9991c7c4841ed62ad"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:60c470a58c5b62b1b12a5f5458f8e2f2f67b94e198d03dc5352f854d9230c394"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:b211e161b6cc2790e0d640ad38e0429d06c944e5da23410f4dc61809dba25095"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:ed90a9de4431cbfb2f3b2ef0c5fd356e61c85117b2be4db3eae28cb409f6e2d5"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:68e1e49a5675748233f7b05330f092582cd52f2850b4244939fd75ba640593ed"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:764fc15418d94bce5c2f8ebdbf66544f96f42efb1364b61e715e5b33281b388d"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e64442aba81ed4df1ca494b87bf818569a1280acaa73071c68014f7a884e83f1"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:83168126ae2457d1a19b2af665cafa7ef78c2dcff192d7d7b5dad6b36c73ae24"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69d0180bca594e81cdb4a2af328bdb4046f59e10aaeef7619496fe64f2ec918c"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:80cbf0b043061451660099fff9001a7faacb2c9c983842b4819526e2f944dc6c"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e1b8f5e2f9637492b0da4d51f78ecb17786e61d6c461ead8542c944750faf4f9"}, - {file = "pymongo-4.2.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:1a957cdc2b26eeed4d8f1889a40c6023dd1bd94672dd0f5ce327314f2caaefd4"}, - {file = "pymongo-4.2.0-cp37-cp37m-win32.whl", hash = "sha256:6bd5888997ea3eae9830c6cc7964b61dcfbc50eb3a5a6ce56ad5f86d5579b11c"}, - {file = "pymongo-4.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:dc24737d24ce0de762bee9c2a884639819485f679bbac8ab5be9c161ef6f9b2c"}, - {file = "pymongo-4.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:193cc97d44b1e6d2253ea94e30c6f94f994efb7166e2452af4df55825266e88b"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:e152c26ffc30331e9d57591fc4c05453c209aa20ba299d1deb7173f7d1958c22"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8a9bc4dcfc2bda69ee88cdb7a89b03f2b8eca668519b704384a264dea2db4209"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:8cbb868e88c4eee1c53364bb343d226a3c0e959e791e6828030cb78f46cfcbe3"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:2bfe6b59f431f40fa545547616f4acf0c0c4b64518b1f951083e3bad06eb368b"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:ff66014687598823b6b23751884b4aa67eb934445406d95894dfc60cb7bfcc18"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:31c50da4a080166bc29403aa91f4c76e0889b4f24928d1b60508a37c1bf87f9a"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:ccfdc7722df445c49dc6b5d514c3544cad99b53189165f7546793933050ac7fb"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc7ebc37b03956a070260665079665eae69e5e96007694214f3a2107af96816a"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8b4a782aac43948308087b962c9ecb030ba98886ce6dee3ad7aafe8c5e1ce80"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1c23527f8e13f526fededbb96f2e7888f179fe27c51d41c2724f7059b75b2fa"}, - 
{file = "pymongo-4.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83cc3c35aeeceb67143914db67f685206e1aa37ea837d872f4bc28d7f80917c9"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e09cdf5aad507c8faa30d97884cc42932ed3a9c2b7f22cc3ccc607bae03981b3"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0f53253f4777cbccc426e669a2af875f26c95bd090d88593287b9a0a8ac7fa25"}, - {file = "pymongo-4.2.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:21238b19243a42f9a34a6d39e7580ceebc6da6d2f3cf729c1cff9023cb61a5f1"}, - {file = "pymongo-4.2.0-cp38-cp38-win32.whl", hash = "sha256:766acb5b1a19eae0f7467bcd3398748f110ea5309cdfc59faa5185dcc7fd4dca"}, - {file = "pymongo-4.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:10f09c4f09757c2e2a707ad7304f5d69cb8fdf7cbfb644dbacfe5bbe8afe311b"}, - {file = "pymongo-4.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a6bf01b9237f794fa3bdad5089474067d28be7e199b356a18d3f247a45775f26"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:d8bb745321716e7a11220a67c88212ecedde4021e1de4802e563baef9df921d2"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3be53e9888e759c49ae35d747ff77a04ff82b894dd64601e0f3a5a159b406245"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:a3efdf154844244e0dabe902cf1827fdced55fa5b144adec2a86e5ce50a99b97"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:a7eb5b06744b911b6668b427c8abc71b6d624e72d3dfffed00988fa1b4340f97"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:b0be613d926c5dbb0d3fc6b58e4f2be4979f80ae76fda6e47309f011b388fe0c"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:e7dcb73f683c155885a3488646fcead3a895765fed16e93c9b80000bc69e96cb"}, - {file = 
"pymongo-4.2.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:b537dd282de1b53d9ae7cf9f3df36420c8618390f2da92100391f3ba8f3c141a"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d98d2a8283c9928a9e5adf2f3c0181e095579e9732e1613aaa55d386e2bcb6c5"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:76892bbce743eb9f90360b3626ea92f13d338010a1004b4488e79e555b339921"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:124d0e880b66f9b0778613198e89984984fdd37a3030a9007e5f459a42dfa2d3"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:773467d25c293f8e981b092361dab5fd800e1ba318403b7959d35004c67faedc"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6673ab3fbf3135cc1a8c0f70d480db5b2378c3a70af8d602f73f76b8338bdf97"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:153b8f8705970756226dfeeb7bb9637e0ad54a4d79b480b4c8244e34e16e1662"}, - {file = "pymongo-4.2.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:01721da74558f2f64a9f162ee063df403ed656b7d84229268d8e4ae99cfba59c"}, - {file = "pymongo-4.2.0-cp39-cp39-win32.whl", hash = "sha256:a25c0eb2d610b20e276e684be61c337396813b636b69373c17314283cb1a3b14"}, - {file = "pymongo-4.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:44b36ccb90aac5ea50be23c1a6e8f24fbfc78afabdef114af16c6e0a80981364"}, - {file = "pymongo-4.2.0.tar.gz", hash = "sha256:72f338f6aabd37d343bd9d1fdd3de921104d395766bcc5cdc4039e4c2dd97766"}, -] - -[package.extras] -aws = ["pymongo-auth-aws (<2.0.0)"] -encryption = ["pymongocrypt (>=1.3.0,<2.0.0)"] -gssapi = ["pykerberos"] -ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] -snappy = ["python-snappy"] -srv = ["dnspython 
(>=1.16.0,<3.0.0)"] -zstd = ["zstandard"] - [[package]] name = "pyparsing" version = "3.1.1" @@ -2274,6 +2200,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -2281,8 +2208,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = 
"PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -2299,6 +2233,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = 
"PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -2306,6 +2241,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -3653,4 +3589,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "a936c5de01f305bdec3631d36eebc375418549fca1be0ad6c7ac2fa810ca151a" +content-hash = "5cba43411b92ed164c4bbc30fbab94b73a41acdeb14fc104e4915a48e04f189b" diff --git a/pyproject.toml b/pyproject.toml index b240e61f..a2862d71 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,9 +12,10 @@ packages = 
[ [tool.poetry.dependencies] python = ">=3.10,<3.11" pyext = {url = "https://files.pythonhosted.org/packages/b0/be/9b6005ac644aaef022527ce49617263379e49dbdbd433d1d3dd66d71f570/pyext-0.7.tar.gz"} -crfm-helm = "0.3" +crfm-helm = "^0.4.0" jq = "^1.6.0" click = "^8.1.7" +casefy = "^0.1.7" [tool.poetry.group.dev.dependencies] pytest-datafiles = "^3.0.0" diff --git a/src/coffee/__init__.py b/src/coffee/__init__.py index e69de29b..4e0f618a 100644 --- a/src/coffee/__init__.py +++ b/src/coffee/__init__.py @@ -0,0 +1,6 @@ +class CoffeeConfig: + def __init__(self): + self.debug = False + + +app_config = CoffeeConfig() diff --git a/src/coffee/benchmark.py b/src/coffee/benchmark.py index bc8db834..0a68b60c 100644 --- a/src/coffee/benchmark.py +++ b/src/coffee/benchmark.py @@ -1,4 +1,9 @@ from abc import ABC, abstractmethod +from typing import List + +import casefy + +from coffee.helm import BbqHelmTest, HelmTest class Benchmark(ABC): @@ -11,6 +16,18 @@ def __init__(self, sut, scores): def overall_score(self) -> float: pass + @abstractmethod + def tests(self) -> List[HelmTest]: + pass + + @classmethod + def name(cls): + return casefy.titlecase(cls.__name__) + + @classmethod + def path_name(cls): + return casefy.snakecase(cls.__name__) + class RidiculousBenchmark(Benchmark): def overall_score(self) -> float: @@ -21,3 +38,7 @@ def overall_score(self) -> float: count += 1 total += bbq[subject]["bbq_accuracy"] return total / count * 5 + + @classmethod + def tests(cls) -> List[HelmTest]: + return [BbqHelmTest()] diff --git a/src/coffee/helm.py b/src/coffee/helm.py new file mode 100644 index 00000000..c7c3c60e --- /dev/null +++ b/src/coffee/helm.py @@ -0,0 +1,225 @@ +import dataclasses +import json +import logging +import pathlib +import re +import subprocess +from abc import ABC, abstractmethod +from collections import defaultdict +from enum import Enum +from typing import List + +import jq + +import coffee + +# This starts with a bunch of objects that represent things already in 
HELM code. +# As we shift HELM to accommodate a library use case, it would be nice to compose +# a run directly out of objects/enums/constants, or at least compose RunSpecs from +# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq". + + +@dataclasses.dataclass +class SutDescription: + key: str + huggingface: bool = dataclasses.field(repr=False, default=False) + + def __hash__(self): + return super().__hash__() + + +class HelmSut(SutDescription, Enum): + GPT2 = "openai/gpt2" + PYTHIA_70M = "EleutherAI/pythia-70m", True + FB_OPT_125M = "facebook/opt-125m", True + + +class HelmTest(ABC): + # I would like this to be another enum, but BBQ's structural chaos means + # for now we need custom behavior + def __init__(self, prefix): + super().__init__() + self.prefix = prefix + + @abstractmethod + def runspecs(self) -> List[str]: + pass + + def __str__(self): + return self.__class__.__name__ + ":" + self.prefix + + +class BbqHelmTest(HelmTest): + def __init__(self): + super().__init__("bbq") + + # Copied from HELM because it's not exposed outside the method + CATEGORIES = [ + "Age", + "Disability_status", + "Gender_identity", + "Nationality", + "Physical_appearance", + "Race_ethnicity", + "Race_x_SES", + "Race_x_gender", + "Religion", + "SES", + # "Sexual_orientation", TODO: temporarily disabled while Yifan looks into a transformer-related bug + ] + + def runspecs(self) -> List[str]: + return [f"{self.prefix}:subject={c}" for c in BbqHelmTest.CATEGORIES] + + +class HelmScores: + # a kinda hacky container; we won't know the right shape of this for a while, so just use wild dicts + def __init__(self): + super().__init__() + self.data = defaultdict(list) + + def add(self, test, sut, test_sut_scores): + self.data[(test.__class__.__name__, sut)].append(test_sut_scores) + + def for_sut(self, desired_sut) -> dict: + result: defaultdict = defaultdict(dict) + for test, sut in self.data: + if sut == desired_sut: + for entry in self.data[(test, sut)]: + 
result[test].update(entry) + + return result + + +class HelmResult: + def __init__( + self, + tests: List[HelmTest], + suts: List[HelmSut], + output_dir: pathlib.Path, + execution_result: subprocess.CompletedProcess, + ): + super().__init__() + self.tests = tests + self.suts = suts + self.output_dir = output_dir + self.execution_result = execution_result + + def load_scores(self): + focus = self.output_dir / "benchmark_output" / "runs" / "v1" + result = HelmScores() + for t in self.tests: + for s in self.suts: + # long term we'll need a lot more data; this is just enough to compute simple scores + test_sut_scores = {} + glob_path = f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.key)}*" + logging.debug(f"looking for scores for {t} {s} in {focus}/{glob_path}") + for d in focus.glob(glob_path): + subject_result = {} + with open(d / "run_spec.json") as f: + j = json.load(f) + subject = ( + jq.compile(".scenario_spec.args.subject").input_value(j).first() + ) + with open(d / "stats.json") as f: + j = json.load(f) + for stat in j: + if stat["name"]["name"].startswith("bbq_"): + subject_result[stat["name"]["name"]] = stat["sum"] + test_sut_scores[subject] = subject_result + result.add(t, s, test_sut_scores) + return result + + def helm_stdout(self) -> str: + return self._deal_with_bytes(self.execution_result.stdout) + + def helm_stderr(self) -> str: + return self._deal_with_bytes(self.execution_result.stderr) + + def _deal_with_bytes(self, o): + if isinstance(o, bytes): + result = o.decode("utf-8") + else: + result = str(o) + return result + + def _filesystem_safe(self, s: str): + # reproducing some behavior in HELM; would be nice to remove duplication + return re.sub("/", "_", s) + + def success(self): + return self.execution_result and self.execution_result.returncode == 0 + + +class HelmRunner(ABC): + @abstractmethod + def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10): + pass + + +class CliHelmRunner(HelmRunner): + def 
run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10): + runspecs = [] + for s in suts: + for t in tests: + for r in t.runspecs(): + runspecs.append(r + ",model=" + s.key) + huggingface_models = [s.key for s in suts if s.huggingface] + + command = self._helm_command_for_runspecs( + runspecs, max_instances, huggingface_models + ) + logging.debug(f"helm run command: {command}") + + output_dir = self._make_output_dir() + execute_result = self._execute(command, output_dir) + return HelmResult(tests, suts, output_dir, execute_result) + + def _execute( + self, command: List[str], output_dir: pathlib.Path + ) -> subprocess.CompletedProcess: + if coffee.app_config.debug: + return self._run_with_debug_settings(command, output_dir) + else: + return subprocess.run( + " ".join(command), shell=True, capture_output=True, cwd=output_dir + ) + + def _run_with_debug_settings(self, command, output_dir): + with subprocess.Popen( + " ".join(command), + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + cwd=output_dir, + ) as sp: + for line in sp.stdout: + logging.debug(line.decode().rstrip()) + return subprocess.CompletedProcess(sp.args, sp.returncode, sp.stdout, sp.stderr) + + def _make_output_dir(self): + o = pathlib.Path.cwd() + if o.name in ["src", "test"]: + o = o.parent + if not o.name == "run": + o = o / "run" + o.mkdir(exist_ok=True) + return o + + def _helm_command_for_runspecs( + self, bbq_runspecs, max_instances, huggingface_models=None + ): + command = ["helm-run"] + command.extend( + ["--suite", "v1"] + ) # this is a fixed string for now, which is probably wrong + command.extend(["-n", "1"]) # working around a bug + if huggingface_models: + command.append("--enable-huggingface-models") + for m in huggingface_models: + command.append(m) + command.extend(["--max-eval-instances", str(max_instances)]) + + command.append("-r") + command.extend(bbq_runspecs) + return command diff --git a/src/coffee/run.py b/src/coffee/run.py index 
9d934e54..bc4d32c4 100644 --- a/src/coffee/run.py +++ b/src/coffee/run.py @@ -1,174 +1,14 @@ -import json +import logging import pathlib -import re -import subprocess -from abc import abstractmethod, ABC -from collections import defaultdict -from enum import Enum -from typing import List import click -import jq +import coffee from coffee.benchmark import Benchmark, RidiculousBenchmark +from coffee.helm import HelmSut, BbqHelmTest, CliHelmRunner from coffee.static_site_generator import StaticSiteGenerator -# This starts with a bunch of objects that represent things already in HELM code. -# As we shift HELM to accommodate a library use case, it would be nice to compose -# a run directly out of objects/enums/constants, or at least compose RunSpecs from -# exposed pieces that are closely related. E.g., the BbqScenario should know "bbq". - - -class HelmSut(Enum): - GPT2 = "huggingface/gpt2" - - -class HelmTest(ABC): - # I would like this to be another enum, but BBQ's structural chaos means - # for now we need custom behavior - def __init__(self, prefix): - super().__init__() - self.prefix = prefix - - @abstractmethod - def runspecs(self) -> List[str]: - pass - - -class BbqHelmTest(HelmTest): - def __init__(self): - super().__init__("bbq") - - # Copied from HELM because it's not exposed outside the method - CATEGORIES = [ - "Age", - "Disability_status", - "Gender_identity", - "Nationality", - "Physical_appearance", - "Race_ethnicity", - "Race_x_SES", - "Race_x_gender", - "Religion", - "SES", - "Sexual_orientation", - ] - - def runspecs(self) -> List[str]: - return [f"{self.prefix}:subject={c}" for c in BbqHelmTest.CATEGORIES] - - -class HelmScores: - # a kinda hacky container; we won't know the right shape of this for a while, so just use wild dicts - def __init__(self): - super().__init__() - self.data = defaultdict(list) - - def add(self, test, sut, test_sut_scores): - self.data[(test.__class__.__name__, sut)].append(test_sut_scores) - - def for_sut(self, desired_sut) 
-> dict: - result: defaultdict = defaultdict(dict) - for test, sut in self.data: - if sut == desired_sut: - for entry in self.data[(test, sut)]: - result[test].update(entry) - - return result - - -class HelmResult: - def __init__( - self, - tests: List[HelmTest], - suts: List[HelmSut], - output_dir: pathlib.Path, - execution_result: subprocess.CompletedProcess, - ): - super().__init__() - self.tests = tests - self.suts = suts - self.output_dir = output_dir - # TODO: make sure the execution succeeded - - def load_scores(self): - focus = self.output_dir / "benchmark_output" / "runs" / "v1" - result = HelmScores() - for t in self.tests: - for s in self.suts: - # long term we'll need a lot more data; this is just enough to compute simple scores - test_sut_scores = {} - for d in focus.glob( - f"{self._filesystem_safe(t.prefix)}:*model={self._filesystem_safe(s.value)}*" - ): - subject_result = {} - with open(d / "run_spec.json") as f: - j = json.load(f) - subject = ( - jq.compile(".scenario_spec.args.subject").input_value(j).first() - ) - with open(d / "stats.json") as f: - j = json.load(f) - for stat in j: - if stat["name"]["name"].startswith("bbq_"): - subject_result[stat["name"]["name"]] = stat["sum"] - test_sut_scores[subject] = subject_result - result.add(t, s, test_sut_scores) - return result - - def _filesystem_safe(self, s: str): - # reproducing some behavior in HELM; would be nice to remove duplication - return re.sub("/", "_", s) - - -class HelmRunner(ABC): - @abstractmethod - def run(self, tests: List[HelmTest], models: List[HelmSut], max_instances=10): - pass - - -class CliHelmRunner(HelmRunner): - def run(self, tests: List[HelmTest], suts: List[HelmSut], max_instances=10): - runspecs = [] - for s in suts: - for t in tests: - for r in t.runspecs(): - runspecs.append(r + ",model=" + s.value) - - command = self._helm_command_for_runspecs(runspecs, max_instances) - - output_dir = self._make_output_dir() - execute_result = self._execute(command, output_dir) - 
return HelmResult(tests, suts, output_dir, execute_result) - - def _execute(self, command, output_dir): - return subprocess.run( - " ".join(command), shell=True, capture_output=True, cwd=output_dir - ) - - def _make_output_dir(self): - o = pathlib.Path.cwd() - if o.name in ["src", "test"]: - o = o.parent - if not o.name == "run": - o = o / "run" - o.mkdir(exist_ok=True) - return o - - def _helm_command_for_runspecs(self, bbq_runspecs, max_instances): - command = ["helm-run"] - command.extend( - ["--suite", "v1"] - ) # this is fixed for now, which is probably wrong - command.extend(["-n", "1"]) # working around a bug - command.extend(["--max-eval-instances", str(max_instances)]) - - command.append("-r") - command.extend(bbq_runspecs) - return command - - def quantize_stars(raw_score): return round(2 * raw_score) / 2.0 @@ -181,10 +21,26 @@ def quantize_stars(raw_score): type=click.Path(file_okay=False, dir_okay=True, path_type=pathlib.Path), ) @click.option("--max-instances", "-m", type=int, default=100) -def cli(output_dir: pathlib.Path, max_instances: int): +@click.option("--debug", default=False, is_flag=True) +def cli(output_dir: pathlib.Path, max_instances: int, debug: bool) -> None: + coffee.app_config.debug = debug + + if coffee.app_config.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + runner = CliHelmRunner() - suts = [HelmSut.GPT2] - result = runner.run([BbqHelmTest()], suts, max_instances=max_instances) + suts = [HelmSut.GPT2, HelmSut.PYTHIA_70M, HelmSut.FB_OPT_125M] + result = runner.run(RidiculousBenchmark.tests(), suts, max_instances=max_instances) + if not result.success(): + print( + f"HELM execution failed with return code {result.execution_result.returncode}:" + ) + print("stdout:") + print(result.helm_stdout()) + print("stderr:") + print(result.helm_stderr()) scores = result.load_scores() benchmarks: list[Benchmark] = [] for sut in suts: diff --git a/src/coffee/static_site_generator.py 
b/src/coffee/static_site_generator.py index e5e40dda..5be03194 100644 --- a/src/coffee/static_site_generator.py +++ b/src/coffee/static_site_generator.py @@ -4,6 +4,7 @@ from itertools import groupby from typing import Tuple +import casefy from jinja2 import Environment, PackageLoader, select_autoescape from coffee.benchmark import Benchmark @@ -60,11 +61,7 @@ def _static_dir(self): return self._template_dir() / "static" def _copy_static_dir(self, output_dir): - shutil.copytree( - self._static_dir(), - output_dir / "static", - dirs_exist_ok=True - ) + shutil.copytree(self._static_dir(), output_dir / "static", dirs_exist_ok=True) def generate(self, benchmarks: list[Benchmark], output_dir: pathlib.Path) -> None: self._copy_static_dir(output_dir) @@ -72,23 +69,29 @@ def generate(self, benchmarks: list[Benchmark], output_dir: pathlib.Path) -> Non benchmark_template = self.env.get_template("benchmark.html") index_template = self.env.get_template("index.html") - for benchmark_name, grouped_benchmarks in groupby(benchmarks, lambda x: x.__class__.__name__): + for benchmark_class, grouped_benchmarks in groupby( + benchmarks, lambda x: x.__class__ + ): suts = {} for benchmark in grouped_benchmarks: this_sut = suts[benchmark.sut.name] = {} - this_sut["stars"], this_sut["half_star"], this_sut["empty_stars"] = self.calculate_stars(benchmark) + ( + this_sut["stars"], + this_sut["half_star"], + this_sut["empty_stars"], + ) = self.calculate_stars(benchmark) this_sut["name"] = benchmark.sut.name with open( - pathlib.Path( - output_dir, f"{benchmark_name.lower()}.html" - ), - "w+", + pathlib.Path( + output_dir, f"{casefy.snakecase(benchmark_class.path_name())}.html" + ), + "w+", ) as f: f.write( benchmark_template.render( suts=suts, - benchmark_name=benchmark_name, + benchmark_name=benchmark_class.name(), benchmarks=benchmarks, stars_description=STARS_DESCRIPTION, ) @@ -99,4 +102,4 @@ def generate(self, benchmarks: list[Benchmark], output_dir: pathlib.Path) -> Non 
index_template.render( benchmarks=benchmarks, stars_description=STARS_DESCRIPTION ) - ) \ No newline at end of file + ) diff --git a/src/coffee/templates/base.html b/src/coffee/templates/base.html index 4b6b371c..cbf8f804 100644 --- a/src/coffee/templates/base.html +++ b/src/coffee/templates/base.html @@ -23,8 +23,8 @@ Benchmarks diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/run_spec.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/run_spec.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario.json similarity index 100% rename from 
tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/stats.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=huggingface_gpt2/stats.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Age,method=multiple_choice_joint,model=openai_gpt2/stats.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json similarity index 100% rename from 
tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/per_instance_stats.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/per_instance_stats.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/run_spec.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/run_spec.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/run_spec.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json similarity index 
100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/scenario_state.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/scenario_state.json diff --git a/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/stats.json b/tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/stats.json similarity index 100% rename from tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=huggingface_gpt2/stats.json rename to tests/data/full_runs/simple_bbq/benchmark_output/runs/v1/bbq:subject=Disability_status,method=multiple_choice_joint,model=openai_gpt2/stats.json diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py new file mode 100644 index 00000000..2c0d3c83 --- /dev/null +++ b/tests/test_benchmark.py @@ -0,0 +1,16 @@ +from coffee.helm import HelmSut +from coffee.benchmark import RidiculousBenchmark + + +def test_class_basics(): + assert RidiculousBenchmark.name() == "Ridiculous Benchmark" + assert RidiculousBenchmark.path_name() == "ridiculous_benchmark" + + +def test_instance_basics(): + rb = RidiculousBenchmark(HelmSut.GPT2, {}) + assert rb.name() == "Ridiculous Benchmark" + assert rb.path_name() == "ridiculous_benchmark" + + +# not testing benchmark scoring for the moment because it's all throwaway diff --git a/tests/test_helm_runner.py b/tests/test_helm_runner.py index 1c48bfb1..46a29c42 100644 --- a/tests/test_helm_runner.py +++ b/tests/test_helm_runner.py @@ -5,13 +5,8 @@ import pytest -from coffee.run import ( - CliHelmRunner, - BbqHelmTest, - HelmSut, - HelmResult, - quantize_stars, -) +from coffee.run import quantize_stars +from
coffee.helm import HelmSut, BbqHelmTest, HelmResult, CliHelmRunner from coffee.benchmark import RidiculousBenchmark @@ -22,10 +17,24 @@ def test_cli_helm_runner_command(cwd_tmpdir): shell_arguments = runner._execute.call_args.args[0] assert "helm-run" == shell_arguments[0] runspecs = shell_arguments[shell_arguments.index("-r") + 1 :] - assert "bbq:subject=Age,model=huggingface/gpt2" == runspecs[0] + assert "bbq:subject=Age,model=openai/gpt2" == runspecs[0] assert len(BbqHelmTest.CATEGORIES) == len(runspecs) +def test_cli_helm_runner_command_handles_huggingface_models(cwd_tmpdir): + runner = CliHelmRunner() + runner._execute = Mock() + # try one normal model, one magic huggingface model + runner.run([BbqHelmTest()], [HelmSut.GPT2, HelmSut.FB_OPT_125M, HelmSut.PYTHIA_70M]) + shell_arguments = runner._execute.call_args.args[0] + enables = [ + i for (i, s) in enumerate(shell_arguments) if s == "--enable-huggingface-models" + ] + assert len(enables) == 1 + assert shell_arguments[enables[0] + 1] == HelmSut.FB_OPT_125M.key + assert shell_arguments[enables[0] + 2] == HelmSut.PYTHIA_70M.key + + @pytest.mark.datafiles(SIMPLE_BBQ_DATA) def test_read_scores(datafiles): hr = HelmResult([BbqHelmTest()], [HelmSut.GPT2], datafiles, None) @@ -56,3 +65,14 @@ def test_quantize_stars(): assert 4.5 == quantize_stars(4.6) assert 4.5 == quantize_stars(4.7) assert 4.5 == quantize_stars(4.74) + + +def test_helmsut_basics(): + assert HelmSut.GPT2.key == "openai/gpt2" + assert hash(HelmSut.GPT2) is not None + + +def test_helmsut_huggingface(): + assert HelmSut.GPT2.huggingface == False + assert HelmSut.FB_OPT_125M.huggingface == True + assert HelmSut.PYTHIA_70M.huggingface == True diff --git a/tests/test_static_site_generator.py b/tests/test_static_site_generator.py index 11dcb618..e6d8621d 100644 --- a/tests/test_static_site_generator.py +++ b/tests/test_static_site_generator.py @@ -4,11 +4,7 @@ import pytest -from coffee.run import ( - BbqHelmTest, - HelmSut, - HelmResult, -) +from 
coffee.helm import HelmSut, BbqHelmTest, HelmResult from coffee.benchmark import RidiculousBenchmark from coffee.static_site_generator import StaticSiteGenerator @@ -21,7 +17,9 @@ def benchmark(datafiles): return b -@pytest.mark.parametrize("path", ["ridiculousbenchmark.html", "static/images/ml_commons_logo.png"]) +@pytest.mark.parametrize( + "path", ["ridiculous_benchmark.html", "static/images/ml_commons_logo.png"] +) @pytest.mark.datafiles(SIMPLE_BBQ_DATA) def test_creates_files(benchmark, tmp_path, path): generator = StaticSiteGenerator()