Skip to content

Commit

Permalink
update checkpoints
Browse files Browse the repository at this point in the history
  • Loading branch information
benob committed Apr 10, 2024
1 parent 492caff commit cee75bf
Showing 1 changed file with 32 additions and 32 deletions.
64 changes: 32 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,54 +56,54 @@ Models

All models are trained on the first 100M tokens of [Common Crawl](http://data.statmt.org/cc-100/)

[checkpoints/it.22000](https://github.com/CoffeePerry/recasepunc/releases/download/v0.1.0/it.22000)
[checkpoints/it.23000](https://github.com/benob/recasepunc/releases/download/v0.4/it.23000)
```
{
"iteration": "22000",
"train_loss": "0.058934884114190934",
"valid_loss": "0.06988634882792658",
"valid_accuracy_case": "0.9575860089785607",
"valid_accuracy_punc": "0.940614491584733",
"valid_fscore": "{0: 0.6431694030761719, 1: 0.6150795817375183, 2: 0.7023577094078064, 3: 0.5514711737632751, 4: 0.21250930428504944}",
"config": "{'seed': 871253, 'lang': 'it', 'flavor': 'dbmdz/bert-base-italian-uncased', 'max_length': 256, 'batch_size': 4, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/it-100M.train.x', 'data/it-100M.train.y', 'data/it-100M.valid.x', 'data/it-100M.valid.y', 'checkpoints/it'], 'pad_token_id': 0, 'cls_token_id': 102, 'cls_token': '[CLS]', 'sep_token_id': 103, 'sep_token': '[SEP]'}"
"iteration": "23000",
"train_loss": "0.015077149430289864",
"valid_loss": "0.021484553813934326",
"valid_accuracy_case": "0.9517227564102564",
"valid_accuracy_punc": "0.9359975961538461",
"valid_fscore": "{0: 0.6016615629196167, 1: 0.6202345490455627, 2: 0.6219512224197388, 3: 0.42424243688583374, 4: 0.08571428805589676}",
"config": "{'seed': 871253, 'lang': 'it', 'flavor': 'dbmdz/bert-base-italian-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/it-100M.train.x', 'data/it-100M.train.y', 'data/it-100M.valid.x', 'data/it-100M.valid.y', 'checkpoints/it'], 'pad_token_id': 0, 'cls_token_id': 102, 'cls_token': '[CLS]', 'sep_token_id': 103, 'sep_token': '[SEP]'}"
}
```

[checkpoints/zh.24000](https://github.com/benob/recasepunc/releases/download/0.3/zh.24000)
[checkpoints/zh-Hant.17000](https://github.com/benob/recasepunc/releases/download/0.4/zh-Hant.17000)
```
{
"iteration": "24000",
"train_loss": "0.006788245493080467",
"valid_loss": "0.007345725328494341",
"valid_accuracy_case": "0.9963942307692307",
"valid_accuracy_punc": "0.9692508012820513",
"valid_fscore": "{0: 0.7727023363113403, 1: 0.7901785373687744, 2: 0.7293065190315247, 3: 0.7692307829856873, 4: 0.4615384638309479}",
"config": "{'seed': 871253, 'lang': 'zh', 'flavor': 'ckiplab/bert-base-chinese', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/zh-100M.train.x', 'data/zh-100M.train.y', 'data/zh-100M.valid.x', 'data/zh-100M.valid.y', 'checkpoints/zh'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
"iteration": "17000",
"train_loss": "0.007012549160048366",
"valid_loss": "0.007463883130978315",
"valid_accuracy_case": "0.9967948717948718",
"valid_accuracy_punc": "0.9682491987179487",
"valid_fscore": "{0: 0.7668336033821106, 1: 0.7813194990158081, 2: 0.7200000286102295, 3: 0.8333333730697632, 4: 0.7272727489471436}",
"config": "{'seed': 871253, 'lang': 'zh-Hant', 'flavor': 'ckiplab/bert-base-chinese', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/zh-Hant-100M.train.x', 'data/zh-Hant-100M.train.y', 'data/zh-Hant-100M.valid.x', 'data/zh-Hant-100M.valid.y', 'checkpoints/zh-Hant'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
}
```

[checkpoints/en.23000](https://github.com/benob/recasepunc/releases/download/0.3/en.23000)
[checkpoints/en.22000](https://github.com/benob/recasepunc/releases/download/0.4/en.22000)
```
{
"iteration": "23000",
"train_loss": "0.014598741472698748",
"valid_loss": "0.025432642453756087",
"valid_accuracy_case": "0.9407051282051282",
"valid_accuracy_punc": "0.9401041666666666",
"valid_fscore": "{0: 0.6455026268959045, 1: 0.5925925970077515, 2: 0.7243649959564209, 3: 0.7027027010917664, 4: 0.03921568766236305}",
"config": "{'seed': 871253, 'lang': 'en', 'flavor': 'bert-base-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/en-100M.train.x', 'data/en-100M.train.y', 'data/en-100M.valid.x', 'data/en-100M.valid.y', 'checkpoints/en'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
"iteration": "22000",
"train_loss": "0.01467611983884126",
"valid_loss": "0.02559371789296468",
"valid_accuracy_case": "0.9393028846153846",
"valid_accuracy_punc": "0.9404046474358975",
"valid_fscore": "{0: 0.6431096196174622, 1: 0.603951096534729, 2: 0.7078340649604797, 3: 0.6865671277046204, 4: 0}",
"config": "{'seed': 871253, 'lang': 'en', 'flavor': 'bert-base-uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/en-100M.train.x', 'data/en-100M.train.y', 'data/en-100M.valid.x', 'data/en-100M.valid.y', 'checkpoints/en'], 'pad_token_id': 0, 'cls_token_id': 101, 'cls_token': '[CLS]', 'sep_token_id': 102, 'sep_token': '[SEP]'}"
}
```

[checkpoints/fr.22000](https://github.com/benob/recasepunc/releases/download/0.3/fr.22000)
[checkpoints/fr.24000](https://github.com/benob/recasepunc/releases/download/0.4/fr.24000)
```
{
"iteration": "22000",
"train_loss": "0.02052250287961215",
"valid_loss": "0.009240646392871171",
"valid_accuracy_case": "0.9881810897435898",
"valid_accuracy_punc": "0.9683493589743589",
"valid_fscore": "{0: 0.802524745464325, 1: 0.7892595529556274, 2: 0.8360477685928345, 3: 0.8717948198318481, 4: 0.2068965584039688}",
"iteration": "24000",
"train_loss": "0.015482447233051061",
"valid_loss": "0.006200919071069131",
"valid_accuracy_case": "1.0",
"valid_accuracy_punc": "0.9691506410256411",
"valid_fscore": "{0: 0.8114132881164551, 1: 0.7968379855155945, 2: 0.8446389436721802, 3: 0.8421052694320679, 4: 0.3076923191547394}",
"config": "{'seed': 871253, 'lang': 'fr', 'flavor': 'flaubert/flaubert_base_uncased', 'max_length': 256, 'batch_size': 16, 'updates': 24000, 'period': 1000, 'lr': 1e-05, 'dab_rate': 0.1, 'device': device(type='cuda'), 'debug': False, 'action': 'train', 'action_args': ['data/fr-100M.train.x', 'data/fr-100M.train.y', 'data/fr-100M.valid.x', 'data/fr-100M.valid.y', 'checkpoints/fr'], 'pad_token_id': 2, 'cls_token_id': 0, 'cls_token': '<s>', 'sep_token_id': 1, 'sep_token': '</s>'}"
}
```
Expand Down Expand Up @@ -139,9 +139,9 @@ python recasepunc.py eval test.x test.y checkpoint/path.iteration
Two scripts used to create the models are given as example of how to train for a new language:
* `./prepare.sh <lang>` for downloading data, creating sets, and preprocessing
* `./train.sh <lang>` for training the model

Both assume that an `env.sh` script is available for loading the environment and performing any other required setup.
`requirements.freeze.txt` contains the package versions used for training.

You will need to modify `recasepunc.py` to set the BERT flavor for the new language, and check that the tokenizer correctly handles punctuation. For French, we had to patch the tokenizer to keep input/punctuation synchronized.

Notes
Expand Down

0 comments on commit cee75bf

Please sign in to comment.