Commit
refactor and improvement
hetong007 committed Mar 16, 2018
1 parent a5d67b9 commit c5cfd20
Showing 6 changed files with 179 additions and 216 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,4 +4,5 @@
 data/*
 *.swp
 .ipynb_checkpoints
+submission

141 changes: 50 additions & 91 deletions FashionAI-Attributes-Skirt.ipynb

Large diffs are not rendered by default.

24 changes: 12 additions & 12 deletions README.md
@@ -2,23 +2,23 @@

 This is the repo for [MXNet/Gluon](http://mxnet.incubator.apache.org/) benchmark scripts for the [FashionAI](https://tianchi.aliyun.com/competition/information.htm?spm=5176.100067.5678.2.505c3a26Oet3cf&raceId=231649) competition by Alibaba.
 
-The generated submission will have mAP around 0.95 and Basic Precision around 0.84 on the board.
-
 1. Download and untar the data files into the `data/` folder, so the structure looks like
 ```
 Gluon-FashionAI-Attributes
-├── data/
-│   ├── base/
-│   ├── rank/
-│   └── web/
-├── data.py
+├── benchmark.sh
+├── data
+│   ├── base
+│   ├── rank
+│   └── web
 ├── FashionAI-Attributes-Skirt.ipynb
-├── main.py
-└── README.md
+├── prepare_data.py
+├── README.md
+└── train_task.py
 ```
-2. Execute `python data.py` to prepare the `train_valid` folder for the train and validation split.
-3. Execute `python main.py` to train and predict for all eight tasks.
-4. Submit `submission.csv` via the competition portal.
+2. Execute `python prepare_data.py` to prepare the `train_valid` folder for the train and validation split.
+3. Execute `bash benchmark.sh` to prepare data, train and predict for all eight tasks.
+4. Compress and submit `submission/submission.csv` via the competition portal.
 
 The script was tested on a [p3.8xlarge](https://aws.amazon.com/ec2/instance-types/p3/) EC2 instance from AWS. It takes around two and a half hours.
 
+The generated submission will have mAP around 0.95 and Basic Precision around 0.84 on the board.
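
Individual tasks can also be trained one at a time through the new `train_task.py` CLI. A minimal sketch using only the flags defined in its argparse setup (the values shown are the script's defaults, not a tuned configuration):

```bash
# Fine-tune a single attribute task on one GPU with 4 data workers;
# predictions for this task land in submission/skirt_length_labels.csv.
python train_task.py --task skirt_length_labels --model resnet50_v2 \
    --num-gpus 1 -j 4 --epochs 40 --lr 0.001 --lr-factor 0.75 --lr-steps 10,20,30
```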

14 changes: 14 additions & 0 deletions benchmark.sh
@@ -0,0 +1,14 @@
+python prepare_data.py
+
+python train_task.py --task skirt_length_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task collar_design_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task lapel_design_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task neckline_design_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task coat_length_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task neck_design_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task pant_length_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+python train_task.py --task sleeve_length_labels --model resnet50_v2 --num-gpus 4 -j 32 --epochs 40
+
+cd submission
+cat collar_design_labels.csv neckline_design_labels.csv skirt_length_labels.csv sleeve_length_labels.csv neck_design_labels.csv coat_length_labels.csv lapel_design_labels.csv pant_length_labels.csv > submission.csv
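
Each `train_task.py` run writes its per-task probabilities to `submission/<task>.csv` (see `predict` in train_task.py below), so the final `cat` merely stitches the eight files together; the file order should not matter as long as every test row appears exactly once. A quick sanity check before compressing, assuming standard coreutils:

```bash
# The row counts of the eight per-task files should sum to the
# row count of the combined submission.csv (also matched by the glob).
wc -l submission/*.csv
```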

1 change: 1 addition & 0 deletions data.py → prepare_data.py
@@ -30,6 +30,7 @@ def mkdir_if_not_exist(path):
         label_dict[task].append((path, label))
 
 mkdir_if_not_exist(['data/train_valid'])
+mkdir_if_not_exist(['submission'])
 
 for task, path_label in label_dict.items():
     mkdir_if_not_exist(['data/train_valid', task])
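
The body of `mkdir_if_not_exist` sits outside this hunk, but the call sites show it takes a list of path components. A minimal sketch consistent with that usage (an assumed implementation, not the file's actual code):

```python
import os

def mkdir_if_not_exist(path):
    # 'path' is a list of components, e.g. ['data/train_valid', task];
    # join them and create the directory only if it is absent.
    os.makedirs(os.path.join(*path), exist_ok=True)
```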
214 changes: 101 additions & 113 deletions main.py → train_task.py
@@ -1,39 +1,41 @@
 import mxnet as mx
 import numpy as np
-import os, time, logging, math
+import os, time, logging, math, argparse
 
 from mxnet import gluon, image, init, nd
 from mxnet import autograd as ag
 from mxnet.gluon import nn
 from mxnet.gluon.model_zoo import vision as models
 
-task_list = {
-    'collar_design_labels': 5,
-    'skirt_length_labels': 6,
-    'lapel_design_labels': 5,
-    'neckline_design_labels': 10,
-    'coat_length_labels': 8,
-    'neck_design_labels': 5,
-    'pant_length_labels': 6,
-    'sleeve_length_labels': 9}
-
-momentum = 0.9
-wd = 1e-4
-epochs = 40
-
-batch_size = 64
-lr = 1e-3
-num_gpu = 4
-ctx = [mx.gpu(i) for i in range(num_gpu)]
-batch_size = batch_size*num_gpu
-
-logging.basicConfig(level=logging.INFO,
-                    handlers = [
-                        logging.StreamHandler(),
-                        logging.FileHandler('training.log')
-                    ])
-
-def get_ap(labels, outputs):
+def parse_args():
+    parser = argparse.ArgumentParser(description='Gluon for FashionAI Competition',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--task', required=True, type=str,
+                        help='name of the classification task')
+    parser.add_argument('--model', required=True, type=str,
+                        help='name of the pretrained model from model zoo.')
+    parser.add_argument('-j', '--workers', dest='num_workers', default=4, type=int,
+                        help='number of preprocessing workers')
+    parser.add_argument('--num-gpus', default=0, type=int,
+                        help='number of gpus to use, 0 indicates cpu only')
+    parser.add_argument('--epochs', default=40, type=int,
+                        help='number of training epochs')
+    parser.add_argument('-b', '--batch-size', default=64, type=int,
+                        help='mini-batch size')
+    parser.add_argument('--lr', '--learning-rate', default=0.001, type=float,
+                        help='initial learning rate')
+    parser.add_argument('--momentum', default=0.9, type=float,
+                        help='momentum')
+    parser.add_argument('--weight-decay', '--wd', dest='wd', default=1e-4, type=float,
+                        help='weight decay (default: 1e-4)')
+    parser.add_argument('--lr-factor', default=0.75, type=float,
+                        help='learning rate decay ratio')
+    parser.add_argument('--lr-steps', default='10,20,30', type=str,
+                        help='list of learning rate decay epochs as in str')
+    args = parser.parse_args()
+    return args
+
+def calculate_ap(labels, outputs):
     cnt = 0
     ap = 0.
     for label, output in zip(labels, outputs):
@@ -79,22 +81,14 @@ def transform_train(data, label):
     im = nd.transpose(im, (2,0,1))
     return (im, nd.array([label]).asscalar())
 
-def transform_val_normal(data, label):
+def transform_val(data, label):
     im = data.astype('float32') / 255
     im = image.resize_short(im, 256)
     im, _ = image.center_crop(im, (224, 224))
     im = nd.transpose(im, (2,0,1))
     im = mx.nd.image.normalize(im, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
     return (im, nd.array([label]).asscalar())
 
-def transform_val_tencrop(im, label):
-    im = im.astype('float32') / 255
-    im = image.resize_short(im, 256)
-    im = nd.transpose(im, (2,0,1))
-    im = mx.nd.image.normalize(im, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
-    im = ten_crop(im, (224, 224))
-    return (im, nd.array([label]).asscalar())
-
 def transform_predict(im):
     im = im.astype('float32') / 255
     im = image.resize_short(im, 256)
@@ -109,7 +103,7 @@ def progressbar(i, n, bar_len=40):
     prog_bar = '=' * filled_len + '-' * (bar_len - filled_len)
     print('[%s] %s%s' % (prog_bar, percents, '%'), end = '\r')
 
-def test_normal(net, val_data, ctx):
+def validate(net, val_data, ctx):
     metric = mx.metric.Accuracy()
     L = gluon.loss.SoftmaxCrossEntropyLoss()
     AP = 0.
@@ -122,82 +116,54 @@ def test_normal(net, val_data, ctx):
         metric.update(label, outputs)
         loss = [L(yhat, y) for yhat, y in zip(outputs, label)]
         val_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)
-        ap, cnt = get_ap(label, outputs)
+        ap, cnt = calculate_ap(label, outputs)
         AP += ap
         AP_cnt += cnt
     _, val_acc = metric.get()
     return ((val_acc, AP / AP_cnt, val_loss / len(val_data)))
 
-def test_tencrop(net, val_data, ctx):
-    metric = mx.metric.Accuracy()
-    AP = 0.
-    AP_cnt = 0
-    for i, batch in enumerate(val_data):
-        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0, even_split=False)
-        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0, even_split=False)
-        outputs = []
-        for d in data:
-            n = d.shape[0]
-            outs = []
-            for i in range(n):
-                out = net(d[i])
-                out = nd.SoftmaxActivation(out).mean(axis=0)
-                outs.append(out.asnumpy().tolist())
-            outputs.append(nd.array(outs))
-        metric.update(label, outputs)
-        ap, cnt = get_ap(label, outputs)
-        AP += ap
-        AP_cnt += cnt
-    _, val_acc = metric.get()
-    return ((val_acc, AP / AP_cnt))
-
-def train(task, task_num_class):
+def train():
     logging.info('Start Training for Task: %s\n' % (task))
 
     # Initialize the net with pretrained model
-    pretrained_net = models.resnet50_v2(pretrained=True)
+    pretrained_net = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
 
-    finetune_net = models.resnet50_v2(classes=task_num_class)
-    model_name = 'resnet50_v2'
+    finetune_net = gluon.model_zoo.vision.get_model(model_name, classes=task_num_class)
     finetune_net.features = pretrained_net.features
     finetune_net.output.initialize(init.Xavier(), ctx = ctx)
     finetune_net.collect_params().reset_ctx(ctx)
-    # for v in finetune_net.collect_params().values():
-    #     if 'dense' in v.name:
-    #         setattr(v, 'lr_mult', 10)
     finetune_net.hybridize()
 
    # Define DataLoader
     train_data = gluon.data.DataLoader(
         gluon.data.vision.ImageFolderDataset(
             os.path.join('data/train_valid', task, 'train'),
             transform=transform_train),
-        batch_size=batch_size, shuffle=True, num_workers=32, last_batch='discard')
+        batch_size=batch_size, shuffle=True, num_workers=num_workers, last_batch='discard')
 
     val_data = gluon.data.DataLoader(
         gluon.data.vision.ImageFolderDataset(
             os.path.join('data/train_valid', task, 'val'),
-            transform=transform_val_normal),
-        batch_size=batch_size, shuffle=False, num_workers = 32)
+            transform=transform_val),
+        batch_size=batch_size, shuffle=False, num_workers = num_workers)
 
     # Define Trainer
     trainer = gluon.Trainer(finetune_net.collect_params(), 'sgd', {
-        'learning_rate': lr, 'momentum': momentum,'wd': wd})
+        'learning_rate': lr, 'momentum': momentum, 'wd': wd})
     metric = mx.metric.Accuracy()
     L = gluon.loss.SoftmaxCrossEntropyLoss()
-    iteration = 0
 
+    lr_counter = 0
+    num_batch = len(train_data)
 
     # Start Training
     for epoch in range(epochs):
-        if (epoch+1) % 10 == 0:
-            trainer.set_learning_rate(trainer.learning_rate*0.75)
+        if epoch == lr_steps[lr_counter]:
+            trainer.set_learning_rate(trainer.learning_rate*lr_factor)
+            lr_counter += 1
 
         tic = time.time()
         train_loss = 0
-        num_batch = len(train_data)
         metric.reset()
 
-        # if epoch == 40:
-        #     trainer.set_learning_rate(lr*0.1)
         AP = 0.
         AP_cnt = 0
 
@@ -214,59 +180,39 @@ def train(task, task_num_class):
             train_loss += sum([l.mean().asscalar() for l in loss]) / len(loss)
 
             metric.update(label, outputs)
-            ap, cnt = get_ap(label, outputs)
+            ap, cnt = calculate_ap(label, outputs)
             AP += ap
             AP_cnt += cnt
 
-            iteration += 1
             progressbar(i, num_batch-1)
 
         train_map = AP / AP_cnt
         _, train_acc = metric.get()
         train_loss /= num_batch
-        if val_data is None:
-            logging.info('[Epoch %d] train-acc: %.3f, train-map: %.3f, train-loss: %.3f, time: %.1f' %
-                         (epoch, train_acc, train_map, train_loss, time.time() - tic))
-        else:
-            # val_acc, val_map= test(finetune_net, val_data, ctx)
-            val_acc, val_map, val_loss = test_normal(finetune_net, val_data, ctx)
-            logging.info('[Epoch %d] Train-acc: %.3f, mAP: %.3f, loss: %.3f | Val-acc: %.3f, mAP: %.3f, loss: %.3f | time: %.1f' %
-                         (epoch, train_acc, train_map, train_loss, val_acc, val_map, val_loss, time.time() - tic))
+
+        val_acc, val_map, val_loss = validate(finetune_net, val_data, ctx)
+
+        logging.info('[Epoch %d] Train-acc: %.3f, mAP: %.3f, loss: %.3f | Val-acc: %.3f, mAP: %.3f, loss: %.3f | time: %.1f' %
+                     (epoch, train_acc, train_map, train_loss, val_acc, val_map, val_loss, time.time() - tic))
 
     logging.info('\n')
     return (finetune_net)
 
-if __name__ == '__main__':
-    net_dict = {}
-    for task, task_num_class in task_list.items():
-        net_dict[task] = train(task, task_num_class)
-
-    logging.info('Training Finished. Starting Validation.\n')
-    # Validate All Network is Working
-
-    for task in task_list.keys():
-        val_data = gluon.data.DataLoader(
-            gluon.data.vision.ImageFolderDataset(
-                os.path.join('data/train_valid', task, 'val'),
-                transform=transform_val_normal),
-            batch_size=batch_size, shuffle=False, num_workers = 32)
-        val_acc, val_map, val_loss = test_normal(net_dict[task], val_data, ctx)
-        logging.info('[Validation for %s] Val-acc: %.3f, mAP: %.3f, loss: %.3f' %
-                     (task, val_acc, val_map, val_loss))
-
-    logging.info('Validation Finished. Starting Prediction.\n')
-    f_out = open('submission.csv', 'w')
+def predict(task):
+    logging.info('Training Finished. Starting Prediction.\n')
+    f_out = open('submission/%s.csv'%(task), 'w')
     with open('data/rank/Tests/question.csv', 'r') as f_in:
         lines = f_in.readlines()
     tokens = [l.rstrip().split(',') for l in lines]
-    n = len(tokens)
+    task_tokens = [t for t in tokens if t[1] == task]
+    n = len(task_tokens)
     cnt = 0
-    for path, task, _ in tokens:
+    for path, task, _ in task_tokens:
         img_path = os.path.join('data/rank', path)
         with open(img_path, 'rb') as f:
             img = image.imdecode(f.read())
         data = transform_predict(img)
-        out = net_dict[task](data.as_in_context(mx.gpu(0)))
+        out = net(data.as_in_context(mx.gpu(0)))
         out = nd.SoftmaxActivation(out).mean(axis=0)
 
         pred_out = ';'.join(["%.8f"%(o) for o in out.asnumpy().tolist()])
@@ -276,3 +222,45 @@ def train(task, task_num_class):
         progressbar(cnt, n)
     f_out.close()
 
+# Preparation
+args = parse_args()
+
+task_list = {
+    'collar_design_labels': 5,
+    'skirt_length_labels': 6,
+    'lapel_design_labels': 5,
+    'neckline_design_labels': 10,
+    'coat_length_labels': 8,
+    'neck_design_labels': 5,
+    'pant_length_labels': 6,
+    'sleeve_length_labels': 9
+}
+task = args.task
+task_num_class = task_list[task]
+
+model_name = args.model
+
+epochs = args.epochs
+lr = args.lr
+batch_size = args.batch_size
+momentum = args.momentum
+wd = args.wd
+
+lr_factor = args.lr_factor
+lr_steps = [int(s) for s in args.lr_steps.split(',')] + [np.inf]
+
+num_gpus = args.num_gpus
+num_workers = args.num_workers
+ctx = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
+batch_size = batch_size * max(num_gpus, 1)
+
+logging.basicConfig(level=logging.INFO,
+                    handlers = [
+                        logging.StreamHandler(),
+                        logging.FileHandler('training.log')
+                    ])
+
+if __name__ == "__main__":
+    net = train()
+    predict(task)
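
One detail worth calling out in the refactored script: `lr_steps` is parsed from a comma-separated string and padded with `np.inf`, so the `lr_counter` comparison in `train()` can never run past the end of the list. A self-contained sketch of the decay behaviour with the script's default settings (values are illustrative):

```python
import numpy as np

lr, lr_factor = 0.001, 0.75
lr_steps = [int(s) for s in '10,20,30'.split(',')] + [np.inf]

lr_counter = 0
for epoch in range(40):
    # same decay rule as train(): drop the rate once at each listed epoch
    if epoch == lr_steps[lr_counter]:
        lr *= lr_factor
        lr_counter += 1
# after epoch 30: lr == 0.001 * 0.75 ** 3 ≈ 0.000422
```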
