[Doc] Update TRN models' README & metafile (open-mmlab#2129)

hukkai · Jan 6, 2023 · 070d13f · 070d13f
1 parent adb3cfd
commit 070d13f
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 30 deletions.
diff --git a/configs/recognition/trn/README.md b/configs/recognition/trn/README.md
@@ -20,24 +20,22 @@ Temporal relational reasoning, the ability to link meaningful transformations of
 
 ### Something-Something V1
 
-| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | gpu_mem(M) |            config            |            ckpt            |            log            |
-| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :--------: | :--------------------------: | :------------------------: | :-----------------------: |
-|          1x1x8          | height 100 |  8   | ResNet50 | ImageNet |         31.81 / 33.86         |         60.47 / 62.24         |   11037    | [config](/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb_20220815-e13db2e9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.log) |
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) |  testing protocol  | FLOPs  | params |        config         |        ckpt         |         log         |
+| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :----------------: | :----: | :----: | :-------------------: | :-----------------: | :-----------------: |
+|          1x1x8          |  224x224   |  8   | ResNet50 | ImageNet |         31.60 / 33.65         |         60.15 / 62.22         | 16 clips x 10 crop | 42.94G | 26.64M | [config](/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb_20220815-e13db2e9.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.log) |
 
 ### Something-Something V2
 
-| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) | gpu_mem(M) |            config            |            ckpt            |            log            |
-| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :--------: | :--------------------------: | :------------------------: | :-----------------------: |
-|          1x1x8          | height 240 |  8   | ResNet50 | ImageNet |         48.54 / 51.53         |         76.53 / 78.60         |   11037    | [config](/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20220815-e01617db.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) |
+| frame sampling strategy | resolution | gpus | backbone | pretrain | top1 acc (efficient/accurate) | top5 acc (efficient/accurate) |  testing protocol  | FLOPs  | params |        config         |        ckpt         |         log         |
+| :---------------------: | :--------: | :--: | :------: | :------: | :---------------------------: | :---------------------------: | :----------------: | :----: | :----: | :-------------------: | :-----------------: | :-----------------: |
+|          1x1x8          |  224x224   |  8   | ResNet50 | ImageNet |         47.65 / 51.20         |         76.27 / 78.42         | 16 clips x 10 crop | 42.94G | 26.64M | [config](/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py) | [ckpt](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20220815-e01617db.pth) | [log](https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log) |
 
-1. The **gpus** indicates the number of gpu we used to get the checkpoint. It is noteworthy that the configs we provide are used for 8 gpus as default.
-   According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
-   e.g., lr=0.01 for 4 GPUs x 2 video/gpu and lr=0.08 for 16 GPUs x 4 video/gpu.
-2. There are two kinds of test settings for Something-Something dataset, efficient setting (center crop x 1 clip) and accurate setting (Three crop x 2 clip).
+1. The **gpus** column indicates the number of gpus we used to get the checkpoint. If you want to use a different number of gpus or videos per gpu, the best way is to set `--auto-scale-lr` when calling `tools/train.py`; this parameter will auto-scale the learning rate according to the actual batch size and the original batch size.
+2. There are two kinds of test settings for Something-Something dataset, efficient setting (center crop only) and accurate setting (three crop and `twice_sample`).
 3. In the original [repository](https://github.com/zhoubolei/TRN-pytorch), the author augments data with random flipping on the Something-Something dataset, but this augmentation may be wrong for direction-sensitive actions, such as `push left to right`. So, we replaced `flip` with `flip with label mapping`, and changed the testing method from `TenCrop`, which has five flipped crops, to `Twice Sample & ThreeCrop`.
 4. We use `ResNet50` instead of `BNInception` as the backbone of TRN. When training `TRN-ResNet50` on the sthv1 dataset in the original repository, we get top1 (top5) accuracy 30.542 (58.627) vs. ours 31.60 (60.15).
 
-For more details on data preparation, you can refer to [sthv1](/tools/data/sthv1/README.md) and [sthv2](/tools/data/sthv2/README.md).
+For more details on data preparation, you can refer to [Something-something V1](/tools/data/sthv1/README.md) and [Something-something V2](/tools/data/sthv2/README.md).
 
 ## Train
 
@@ -51,7 +49,7 @@ Example: train TRN model on sthv1 dataset in a deterministic option with periodi
 
 ```shell
 python tools/train.py configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py \
-    --cfg-options randomness.seed=0 randomness.deterministic=True
+    --seed=0 --deterministic
 ```
 
 For more details, you can refer to the **Training** part in the [Training and Test Tutorial](/docs/en/user_guides/4_train_test.md).

diff --git a/configs/recognition/trn/metafile.yml b/configs/recognition/trn/metafile.yml
@@ -13,20 +13,21 @@ Models:
       Architecture: ResNet50
       Batch Size: 16
       Epochs: 50
-      Parameters: 26641154
+      FLOPs: 42.94G
+      params: 26.64M
       Pretrained: ImageNet
-      Resolution: height 100
+      Resolution: 224x224
       Training Data: SthV1
       Training Resources: 8 GPUs
     Modality: RGB
     Results:
     - Dataset: SthV1
       Task: Action Recognition
       Metrics:
-        Top 1 Accuracy: 33.86
-        Top 1 Accuracy (efficient): 31.81
-        Top 5 Accuracy: 62.24
-        Top 5 Accuracy (efficient): 60.47
+        Top 1 Accuracy: 33.65
+        Top 1 Accuracy (efficient): 31.60
+        Top 5 Accuracy: 62.22
+        Top 5 Accuracy (efficient): 60.15
     Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.log
     Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb_20220815-e13db2e9.pth
 
@@ -37,8 +38,9 @@ Models:
       Architecture: ResNet50
       Batch Size: 16
       Epochs: 50
-      Parameters: 26641154
+      FLOPs: 42.94G
+      params: 26.64M
       Pretrained: ImageNet
-      Resolution: height 240
+      Resolution: 224x224
       Training Data: SthV2
       Training Resources: 8 GPUs
@@ -47,9 +49,9 @@ Models:
     - Dataset: SthV2
       Task: Action Recognition
       Metrics:
-        Top 1 Accuracy: 51.53
-        Top 1 Accuracy (efficient): 48.54
-        Top 5 Accuracy: 78.60
-        Top 5 Accuracy (efficient): 76.53
+        Top 1 Accuracy: 51.20
+        Top 1 Accuracy (efficient): 47.65
+        Top 5 Accuracy: 78.42
+        Top 5 Accuracy (efficient): 76.27
     Training Log: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.log
     Weights: https://download.openmmlab.com/mmaction/v1.0/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb_20220815-e01617db.pth
diff --git a/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py b/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv1-rgb.py
@@ -11,10 +11,12 @@
 ann_file_val = 'data/sthv1/sthv1_val_list_rawframes.txt'
 ann_file_test = 'data/sthv1/sthv1_val_list_rawframes.txt'
 
+file_client_args = dict(io_backend='disk')
+
 sthv1_flip_label_map = {2: 4, 4: 2, 30: 41, 41: 30, 52: 66, 66: 52}
 train_pipeline = [
     dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
-    dict(type='RawFrameDecode'),
+    dict(type='RawFrameDecode', **file_client_args),
     dict(type='Resize', scale=(-1, 256)),
     dict(
         type='MultiScaleCrop',
@@ -35,7 +37,7 @@
         frame_interval=1,
         num_clips=8,
         test_mode=True),
-    dict(type='RawFrameDecode'),
+    dict(type='RawFrameDecode', **file_client_args),
     dict(type='Resize', scale=(-1, 256)),
     dict(type='CenterCrop', crop_size=224),
     dict(type='FormatShape', input_format='NCHW'),
@@ -49,7 +51,7 @@
         num_clips=8,
         twice_sample=True,
         test_mode=True),
-    dict(type='RawFrameDecode'),
+    dict(type='RawFrameDecode', **file_client_args),
     dict(type='Resize', scale=(-1, 256)),
     dict(type='ThreeCrop', crop_size=256),
     dict(type='FormatShape', input_format='NCHW'),

diff --git a/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py b/configs/recognition/trn/trn_imagenet-pretrained-r50_8xb16-1x1x8-50e_sthv2-rgb.py
@@ -11,9 +11,11 @@
 ann_file_val = 'data/sthv2/sthv2_val_list_videos.txt'
 ann_file_test = 'data/sthv2/sthv2_val_list_videos.txt'
 
+file_client_args = dict(io_backend='disk')
+
 sthv2_flip_label_map = {86: 87, 87: 86, 93: 94, 94: 93, 166: 167, 167: 166}
 train_pipeline = [
-    dict(type='DecordInit'),
+    dict(type='DecordInit', **file_client_args),
     dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
     dict(type='DecordDecode'),
     dict(type='Resize', scale=(-1, 256)),
@@ -30,7 +32,7 @@
     dict(type='PackActionInputs')
 ]
 val_pipeline = [
-    dict(type='DecordInit'),
+    dict(type='DecordInit', **file_client_args),
     dict(
         type='SampleFrames',
         clip_len=1,
@@ -44,7 +46,7 @@
     dict(type='PackActionInputs')
 ]
 test_pipeline = [
-    dict(type='DecordInit'),
+    dict(type='DecordInit', **file_client_args),
     dict(
         type='SampleFrames',
         clip_len=1,