Skip to content

Commit

Permalink
Huggingface action.
Browse files Browse the repository at this point in the history
  • Loading branch information
buhrmann committed Feb 25, 2024
1 parent 23698ba commit 51c0b3b
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 5 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/hfhub.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
---
# Workflow: regenerate the Hugging Face Hub metadata dump and commit it back
# to the repository.
name: HfHub

# Runs on manual dispatch, daily at 08:00 (GitHub evaluates cron in UTC), and
# whenever the collector script itself changes on main.
"on":
  workflow_dispatch:
  schedule:
    - cron: "0 8 * * *"
  push:
    branches: main
    paths:
      - pyrennial/hfhub.py

jobs:
  models-datasets:
    name: Collect all models and datasets from huggingface hub
    runs-on: ubuntu-latest
    # The last step pushes a commit, so the job token needs write access to
    # repository contents (read-only may be the repository default).
    permissions:
      contents: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.9'

      - name: Install self
        run: python -m pip install .

      # Runs the module's __main__ block, which writes into ./datasets.
      - name: fetch
        run: |-
          python -m pyrennial.hfhub

      - name: commit
        run: |-
          git config user.name "GitHub Actions Bot"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git add datasets
          # `git commit` exits non-zero when nothing is staged, which would
          # fail the scheduled run; only commit and push when data changed.
          if git diff --cached --quiet; then
            echo "No dataset changes to commit."
          else
            git commit -m "Update huggingface hub datasets."
            git push origin main
          fi
Binary file added datasets/hfhub.csv.zip
Binary file not shown.
12 changes: 11 additions & 1 deletion pyrennial/hfhub.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@
from __future__ import annotations

import dataclasses as dc
import os
import zipfile

import huggingface_hub as hh
import pandas as pd
from huggingface_hub.hf_api import DatasetInfo, ModelInfo
from pandas import DataFrame
from tqdm.auto import tqdm

MODELCARD_FEATURES = {
"tags": "card_tags",
Expand Down Expand Up @@ -55,6 +58,7 @@ def get_multivalued(card: dict | None, key: str) -> list:
def models() -> DataFrame:
"""Create a DataFrame with all models."""
models = (to_dict(model) for model in hh.list_models(full=True, cardData=True))
models = tqdm(models, desc="Fetching model metadata")
df = pd.DataFrame(models)

for key, name in MODELCARD_FEATURES.items():
Expand All @@ -70,7 +74,8 @@ def models() -> DataFrame:
def datasets() -> DataFrame:
"""Create a DataFrame with all datasets."""
dsets = (to_dict(ds) for ds in hh.list_datasets(full=True))
df = pd.DataFrame(dsets)
# Wrap the generator in exactly one progress bar (as models() does); wrapping
# it a second time would render two nested bars for the same iteration.
dsets = tqdm(dsets, desc="Fetching dataset metadata")
df = pd.DataFrame(dsets)

for key, name in DATACARD_FEATURES.items():
df[name] = df.card_data.apply(lambda card: get_multivalued(card, key))
Expand All @@ -85,3 +90,8 @@ def models_datasets() -> DataFrame:
ms = models()
ds = datasets()
return pd.concat([ms, ds], axis=0)


if __name__ == "__main__":
    # Script entry point: collect model and dataset metadata and write it to
    # a CSV archive relative to the current working directory.
    out_path = "./datasets/hfhub.csv.zip"

    # to_csv does not create missing parent directories; make sure the target
    # folder exists so a fresh checkout or a different cwd does not fail.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    df = models_datasets()
    # The ".zip" suffix lets pandas infer zip compression for the output.
    df.to_csv(out_path, index=False)
7 changes: 3 additions & 4 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = pyrennial
version = attr: lector.__pyrennial__
version = attr: pyrennial.__version__
description = Evergreen datasets
long_description = file: README.md
long_description_content_type = text/markdown; charset=UTF-8
Expand Down Expand Up @@ -28,7 +28,7 @@ platforms = any
python_requires = >=3.8
install_requires =
huggingface_hub
lector @ git+https://github.com/graphext/lector.git
lector @ git+https://github.com/graphext/lector.git@main#egg=lector
pandas
pyarrow
tqdm
Expand All @@ -39,7 +39,6 @@ test=pytest

[options.entry_points]
console_scripts =
lector = lector.cli:CLI

[options.extras_require]
test =
Expand All @@ -48,4 +47,4 @@ test =
dev =
pre-commit
ruff
lector[test]
pyrennial[test]

0 comments on commit 51c0b3b

Please sign in to comment.