diff --git a/README.md b/README.md
index 6306e55ec866..c6b638003c62 100755
--- a/README.md
+++ b/README.md
@@ -41,9 +41,13 @@ $ pip install -U -r requirements.txt
## Tutorials
* [Notebook](https://github.com/ultralytics/yolov5/blob/master/tutorial.ipynb)
+* [Kaggle](https://www.kaggle.com/ultralytics/yolov5-tutorial)
* [Train Custom Data](https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data)
-* [Google Cloud Quickstart Guide](https://github.com/ultralytics/yolov5/wiki/GCP-Quickstart)
-* [Docker Quickstart Guide](https://github.com/ultralytics/yolov5/wiki/Docker-Quickstart) ![Docker Pulls](https://img.shields.io/docker/pulls/ultralytics/yolov5?logo=docker)
+* [PyTorch Hub](https://github.com/ultralytics/yolov5/issues/36)
+* [ONNX and TorchScript Export](https://github.com/ultralytics/yolov5/issues/251)
+* [Test-Time Augmentation (TTA)](https://github.com/ultralytics/yolov5/issues/303)
+* [Google Cloud Quickstart](https://github.com/ultralytics/yolov5/wiki/GCP-Quickstart)
+* [Docker Quickstart](https://github.com/ultralytics/yolov5/wiki/Docker-Quickstart) ![Docker Pulls](https://img.shields.io/docker/pulls/ultralytics/yolov5?logo=docker)
## Inference
diff --git a/data/get_coco2017.sh b/data/get_coco2017.sh
index fed57473d5c0..03b2c7e89301 100755
--- a/data/get_coco2017.sh
+++ b/data/get_coco2017.sh
@@ -1,7 +1,11 @@
#!/bin/bash
-# Zip coco folder
-# zip -r coco.zip coco
-# tar -czvf coco.tar.gz coco
+# COCO 2017 dataset http://cocodataset.org
+# Download command: bash yolov5/data/get_coco2017.sh
+# Train command: python train.py --data ./data/coco.yaml
+# Dataset should be placed next to yolov5 folder:
+#   /parent_folder
+#     /coco
+#     /yolov5
# Download labels from Google Drive, accepting presented query
filename="coco2017labels.zip"
diff --git a/data/get_voc.sh b/data/get_voc.sh
new file mode 100644
index 000000000000..b7e66d003133
--- /dev/null
+++ b/data/get_voc.sh
@@ -0,0 +1,214 @@
+# PASCAL VOC dataset http://host.robots.ox.ac.uk/pascal/VOC/
+# Download command: bash ./data/get_voc.sh
+# Train command: python train.py --data voc.yaml
+# Dataset should be placed next to yolov5 folder:
+#   /parent_folder
+#     /VOC
+#     /yolov5
+
+start=`date +%s`
+
+# handle optional download dir
+if [ -z "$1" ]
+  then
+    # navigate to ../tmp
+    echo "navigating to ../tmp/ ..."
+    mkdir -p ../tmp
+    cd ../tmp/
+  else
+    # check if is valid directory
+    if [ ! -d "$1" ]; then
+        echo "$1" "is not a valid directory"
+        exit 1
+    fi
+    echo "navigating to" "$1" "..."
+    cd "$1"
+fi
+
+echo "Downloading VOC2007 trainval ..."
+# Download the data.
+curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+echo "Downloading VOC2007 test data ..."
+curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+echo "Done downloading."
+
+# Extract data
+echo "Extracting trainval ..."
+tar -xf VOCtrainval_06-Nov-2007.tar
+echo "Extracting test ..."
+tar -xf VOCtest_06-Nov-2007.tar
+echo "removing tars ..."
+rm VOCtrainval_06-Nov-2007.tar
+rm VOCtest_06-Nov-2007.tar
+
+end=`date +%s`
+runtime=$((end-start))
+
+echo "Completed in" $runtime "seconds"
+
+start=`date +%s`
+
+# handle optional download dir
+if [ -z "$1" ]
+  then
+    # navigate to ../tmp
+    echo "navigating to ../tmp/ ..."
+    mkdir -p ../tmp
+    cd ../tmp/
+  else
+    # check if is valid directory
+    if [ ! -d "$1" ]; then
+        echo "$1" "is not a valid directory"
+        exit 1
+    fi
+    echo "navigating to" "$1" "..."
+    cd "$1"
+fi
+
+echo "Downloading VOC2012 trainval ..."
+# Download the data.
+curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+echo "Done downloading."
+
+
+# Extract data
+echo "Extracting trainval ..."
+tar -xf VOCtrainval_11-May-2012.tar
+echo "removing tar ..."
+rm VOCtrainval_11-May-2012.tar
+
+end=`date +%s`
+runtime=$((end-start))
+
+echo "Completed in" $runtime "seconds"
+
+cd ../tmp
+echo "Spliting dataset..."
+python3 - "$@" < train.txt
+cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt
+
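The heredoc above is elided; it performs the standard VOC XML to YOLO label conversion (a second elided heredoc, just below, arranges the final /VOC folder layout). A minimal sketch of the conversion, assuming the usual normalized-xywh label format; names are illustrative, not recovered from the elided code:

import xml.etree.ElementTree as ET

def convert(size, box):  # size=(img_w, img_h), box=(xmin, xmax, ymin, ymax)
    # VOC corner coordinates -> YOLO normalized centre/size
    dw, dh = 1. / size[0], 1. / size[1]
    x = (box[0] + box[1]) / 2.0 * dw
    y = (box[2] + box[3]) / 2.0 * dh
    w = (box[1] - box[0]) * dw
    h = (box[3] - box[2]) * dh
    return x, y, w, h

def convert_annotation(xml_path, classes):
    # Parse one VOC XML file into '<class> <x> <y> <w> <h>' label lines
    root = ET.parse(xml_path).getroot()
    size = root.find('size')
    wh = int(size.find('width').text), int(size.find('height').text)
    lines = []
    for obj in root.iter('object'):
        cls = obj.find('name').text
        if cls in classes:
            b = obj.find('bndbox')
            box = tuple(float(b.find(k).text) for k in ('xmin', 'xmax', 'ymin', 'ymax'))
            lines.append('%d %.6f %.6f %.6f %.6f\n' % ((classes.index(cls),) + convert(wh, box)))
    return lines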
+python3 - "$@" < 1 else n # depth gain
-        if m in [nn.Conv2d, Conv, Bottleneck, SPP, DWConv, MixConv2d, Focus, ConvPlus, BottleneckCSP]:
+        if m in [nn.Conv2d, Conv, Bottleneck, SPP, DWConv, MixConv2d, Focus, CrossConv, BottleneckCSP, C3]:
            c1, c2 = ch[f], args[0]

            # Normal
@@ -182,7 +188,7 @@ def parse_model(md, ch): # model_dict, input_channels(3)
                # c2 = make_divisible(c2, 8) if c2 != no else c2

            args = [c1, c2, *args[1:]]
-            if m is BottleneckCSP:
+            if m in [BottleneckCSP, C3]:
                args.insert(2, n)
                n = 1
        elif m is nn.BatchNorm2d:
@@ -198,7 +204,7 @@ def parse_model(md, ch): # model_dict, input_channels(3)
        t = str(m)[8:-2].replace('__main__.', '')  # module type
        np = sum([x.numel() for x in m_.parameters()])  # number params
        m_.i, m_.f, m_.type, m_.np = i, f, t, np  # attach index, 'from' index, type, number params
-        print('%3s%15s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
+        print('%3s%18s%3s%10.0f  %-40s%-30s' % (i, f, n, np, t, args))  # print
        save.extend(x % i for x in ([f] if isinstance(f, int) else f) if x != -1)  # append to savelist
        layers.append(m_)
        ch.append(c2)
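The parse_model() changes above fold the depth-scaled repeat count n into the module args for BottleneckCSP and C3. A hypothetical standalone trace of that arithmetic, with module names as plain strings and gd standing in for the yaml's depth_multiple:

def expand_row(f, n, m, args, ch, gd=0.33):
    # mirrors the depth-gain and channel bookkeeping shown above
    n = max(round(n * gd), 1) if n > 1 else n  # depth gain
    c1, c2 = ch[f], args[0]  # input/output channels
    args = [c1, c2, *args[1:]]
    if m in ['BottleneckCSP', 'C3']:  # repeats are passed to the module, not stacked
        args.insert(2, n)
        n = 1
    return n, args

# e.g. a yaml row [-1, 9, BottleneckCSP, [512]] with 256 channels flowing in:
print(expand_row(-1, 9, 'BottleneckCSP', [512], ch=[256]))  # (1, [256, 512, 3])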
diff --git a/test.py b/test.py
index 259d44444bcd..1cfae9591287 100644
--- a/test.py
+++ b/test.py
@@ -22,6 +22,7 @@ def test(data,
    # Initialize/load model and set device
    if model is None:
        training = False
+        merge = opt.merge  # use Merge NMS
        device = torch_utils.select_device(opt.device, batch_size=batch_size)

        # Remove previous
@@ -30,11 +31,8 @@ def test(data,
        # Load model
        google_utils.attempt_download(weights)
-        model = torch.load(weights, map_location=device)['model'].float()  # load to FP32
-        torch_utils.model_info(model)
-        model.fuse()
-        model.to(device)
-        imgsz = check_img_size(imgsz, s=model.model[-1].stride.max())  # check img_size
+        model = torch.load(weights, map_location=device)['model'].float().fuse().to(device)  # load to FP32
+        imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size

        # Multi-GPU disabled, incompatible with .half() https://github.com/ultralytics/yolov5/issues/99
        # if device.type != 'cpu' and torch.cuda.device_count() > 1:
@@ -59,7 +57,6 @@ def test(data,
    # Dataloader
    if dataloader is None:  # not training
-        merge = opt.merge  # use Merge NMS
        img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
        _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once
        path = data['test'] if opt.task == 'test' else data['val']  # path to val/test images
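check_img_size() (called above with s=model.stride.max()) is not shown in this diff; a minimal sketch of its expected behaviour, assuming the usual round-up-to-a-stride-multiple rule:

import math

def check_img_size(img_size, s=32):
    # round img_size up to the nearest multiple of the max stride s
    new_size = int(math.ceil(img_size / s) * s)
    if new_size != img_size:
        print('WARNING: --img-size %g must be multiple of max stride %g, updating to %g' % (img_size, s, new_size))
    return new_size

print(check_img_size(640))  # 640, already stride-aligned
print(check_img_size(650))  # 672, rounded up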
diff --git a/train.py b/train.py
index eacd265d2bb7..b9b9f083d4d6 100644
--- a/train.py
+++ b/train.py
@@ -85,8 +85,7 @@ def train(hyp, tb_writer, opt, device):
        os.remove(f)

    # Create model
-    model = Model(opt.cfg).to(device)
-    assert model.md['nc'] == nc, '%s nc=%g classes but %s nc=%g classes' % (opt.data, nc, opt.cfg, model.md['nc'])
+    model = Model(opt.cfg, nc=data_dict['nc']).to(device)

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
@@ -117,6 +116,9 @@ def train(hyp, tb_writer, opt, device):
        optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
+    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
+    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine
+    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2
@@ -162,9 +164,7 @@ def train(hyp, tb_writer, opt, device):
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

-    # Scheduler https://arxiv.org/pdf/1812.01187.pdf
-    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine
-    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
+
    scheduler.last_epoch = start_epoch - 1  # do not move
    # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822
    # plot_lr_scheduler(optimizer, scheduler, epochs)
@@ -382,7 +382,6 @@ def train(hyp, tb_writer, opt, device):
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt
-
        # end epoch ----------------------------------------------------------------------------------------------------
    # end training
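The relocated cosine schedule scales lr0 by lf(epoch), decaying smoothly from 1.0 to 0.1 over the run (per the bag-of-tricks paper linked above). A standalone check of its endpoints, using the same lambda:

import math

epochs = 300
lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine

print(lf(0))           # 1.0  -> first epoch runs at the full lr0
print(lf(epochs / 2))  # 0.55 -> halfway point
print(lf(epochs))      # 0.1  -> final epoch at 10% of lr0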
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 71c8f4c28539..786b01896d50 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -77,16 +77,36 @@ def find_modules(model, mclass=nn.Conv2d):
    return [i for i, m in enumerate(model.module_list) if isinstance(m, mclass)]


+def sparsity(model):
+    # Return global model sparsity
+    a, b = 0., 0.
+    for p in model.parameters():
+        a += p.numel()
+        b += (p == 0).sum()
+    return b / a
+
+
+def prune(model, amount=0.3):
+    # Prune model to requested global sparsity
+    import torch.nn.utils.prune as prune
+    print('Pruning model... ', end='')
+    for name, m in model.named_modules():
+        if isinstance(m, nn.Conv2d):
+            prune.l1_unstructured(m, name='weight', amount=amount)  # prune
+            prune.remove(m, 'weight')  # make permanent
+    print(' %.3g global sparsity' % sparsity(model))
+
+
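Possible usage of the new helpers (a sketch; the checkpoint path is illustrative):

import torch
from utils.torch_utils import prune, sparsity

model = torch.load('weights/yolov5s.pt', map_location='cpu')['model'].float()
prune(model, amount=0.3)         # zero the smallest 30% of weights in each nn.Conv2d
print('%.3g' % sparsity(model))  # fraction of parameters that are now exactly zero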
def fuse_conv_and_bn(conv, bn):
    # https://tehnokv.com/posts/fusing-batchnorm-and-conv/
    with torch.no_grad():
        # init
-        fusedconv = torch.nn.Conv2d(conv.in_channels,
-                                    conv.out_channels,
-                                    kernel_size=conv.kernel_size,
-                                    stride=conv.stride,
-                                    padding=conv.padding,
-                                    bias=True)
+        fusedconv = nn.Conv2d(conv.in_channels,
+                              conv.out_channels,
+                              kernel_size=conv.kernel_size,
+                              stride=conv.stride,
+                              padding=conv.padding,
+                              bias=True).to(conv.weight.device)

        # prepare filters
        w_conv = conv.weight.clone().view(conv.out_channels, -1)
@@ -94,10 +114,7 @@ def fuse_conv_and_bn(conv, bn):
        fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.size()))

        # prepare spatial bias
-        if conv.bias is not None:
-            b_conv = conv.bias
-        else:
-            b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device)
+        b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
        b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
        fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
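A quick equivalence check for the fusion above (a sketch; random values stand in for trained BatchNorm statistics):

import torch
import torch.nn as nn
from utils.torch_utils import fuse_conv_and_bn

conv = nn.Conv2d(8, 16, 3, padding=1, bias=False)
bn = nn.BatchNorm2d(16).eval()  # eval(): use running stats, as at inference
with torch.no_grad():  # give the BN non-trivial parameters and statistics
    bn.running_mean.uniform_(-1., 1.)
    bn.running_var.uniform_(0.5, 2.)
    bn.weight.uniform_(0.5, 2.)
    bn.bias.uniform_(-1., 1.)

x = torch.randn(2, 8, 32, 32)
print(torch.allclose(bn(conv(x)), fuse_conv_and_bn(conv, bn)(x), atol=1e-5))  # True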
@@ -140,8 +157,8 @@ def load_classifier(name='resnet101', n=2):
    # Reshape output to n classes
    filters = model.fc.weight.shape[1]
-    model.fc.bias = torch.nn.Parameter(torch.zeros(n), requires_grad=True)
-    model.fc.weight = torch.nn.Parameter(torch.zeros(n, filters), requires_grad=True)
+    model.fc.bias = nn.Parameter(torch.zeros(n), requires_grad=True)
+    model.fc.weight = nn.Parameter(torch.zeros(n, filters), requires_grad=True)
    model.fc.out_features = n
    return model
@@ -176,21 +193,23 @@ class ModelEMA:
"""
def __init__(self, model, decay=0.9999, device=''):
- # make a copy of the model for accumulating moving average of weights
- self.ema = deepcopy(model)
+ # Create EMA
+ self.ema = deepcopy(model.module if is_parallel(model) else model) # FP32 EMA
self.ema.eval()
self.updates = 0 # number of EMA updates
self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) # decay exponential ramp (to help early epochs)
self.device = device # perform ema on different device from model if set
if device:
- self.ema.to(device=device)
+ self.ema.to(device)
for p in self.ema.parameters():
p.requires_grad_(False)
def update(self, model):
- self.updates += 1
- d = self.decay(self.updates)
+ # Update EMA parameters
with torch.no_grad():
+ self.updates += 1
+ d = self.decay(self.updates)
+
msd = model.module.state_dict() if hasattr(model, 'module') else model.state_dict()
esd = self.ema.module.state_dict() if hasattr(self.ema, 'module') else self.ema.state_dict()
for k, v in esd.items():
@@ -200,13 +219,13 @@ def update(self, model):
    def update_attr(self, model):
        # Assign attributes (which may change during training)
-        for k in model.__dict__.keys():
+        for k, v in model.__dict__.items():
            # TODO: This is ugly. Custom attributes should have some specific naming strategy.
-            if not (k.startswith('_') or k == 'module' or
-                    isinstance(getattr(model, k), (torch.distributed.ProcessGroupNCCL, torch.distributed.Reducer))):
+            if not (k.startswith('_') or k in ["process_group", "reducer"] or
+                    isinstance(v, (torch.distributed.ProcessGroupNCCL, torch.distributed.Reducer))):
                try:
-                    pickle.dumps(getattr(model, k))
+                    pickle.dumps(v)
                except Exception:
                    continue
                else:
-                    setattr(self.ema, k, getattr(model, k))
+                    setattr(self.ema, k, v)
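The decay ramp decay * (1 - exp(-x / 2000)) is what "help early epochs" means in practice: early updates track the raw weights closely, later ones average over a long horizon. Its value at a few update counts (standalone arithmetic):

import math

for updates in (1, 100, 2000, 10000):
    print(updates, round(0.9999 * (1 - math.exp(-updates / 2000)), 4))
# 1 0.0005, 100 0.0488, 2000 0.6321, 10000 0.9932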
diff --git a/utils/utils.py b/utils/utils.py
index cef43a1446a2..4673fa5628e1 100755
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -451,7 +451,9 @@ def compute_loss(p, targets, model): # predictions, targets, model
        BCEcls, BCEobj = FocalLoss(BCEcls, g), FocalLoss(BCEobj, g)

    # per output
-    nt = 0  # targets
+    nt = 0  # number of targets
+    np = len(p)  # number of outputs
+    balance = [1.0, 1.0, 1.0]
    for i, pi in enumerate(p):  # layer index, layer predictions
        b, a, gj, gi = indices[i]  # image, anchor, gridy, gridx
        tobj = torch.zeros_like(pi[..., 0])  # target obj
@@ -481,11 +483,12 @@ def compute_loss(p, targets, model): # predictions, targets, model
            # with open('targets.txt', 'a') as file:
            #     [file.write('%11.5g ' * 4 % tuple(x) + '\n') for x in torch.cat((txy[i], twh[i]), 1)]

-        lobj += BCEobj(pi[..., 4], tobj)  # obj loss
+        lobj += BCEobj(pi[..., 4], tobj) * balance[i]  # obj loss

-    lbox *= h['giou']
-    lobj *= h['obj']
-    lcls *= h['cls']
+    s = 3 / np  # output count scaling
+    lbox *= h['giou'] * s
+    lobj *= h['obj'] * s
+    lcls *= h['cls'] * s
    bs = tobj.shape[0]  # batch size
    if red == 'sum':
        g = 3.0  # loss gain
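The new s = 3 / np factor keeps the giou/obj/cls gains (tuned for the standard 3-output head) comparable if a model has more or fewer detection layers. The arithmetic, standalone:

for num_outputs in (3, 4, 5):
    print(num_outputs, round(3 / num_outputs, 2))
# 3 1.0, 4 0.75, 5 0.6 -> a no-op for the usual 3-output head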
@@ -524,16 +527,14 @@ def build_targets(p, targets, model):
            a, t = at[j], t.repeat(na, 1, 1)[j]  # filter

            # overlaps
+            g = 0.5  # offset
            gxy = t[:, 2:4]  # grid xy
            z = torch.zeros_like(gxy)
            if style == 'rect2':
-                g = 0.2  # offset
                j, k = ((gxy % 1. < g) & (gxy > 1.)).T
                a, t = torch.cat((a, a[j], a[k]), 0), torch.cat((t, t[j], t[k]), 0)
                offsets = torch.cat((z, z[j] + off[0], z[k] + off[1]), 0) * g
-
            elif style == 'rect4':
-                g = 0.5  # offset
                j, k = ((gxy % 1. < g) & (gxy > 1.)).T
                l, m = ((gxy % 1. > (1 - g)) & (gxy < (gain[[2, 3]] - 1.))).T
                a, t = torch.cat((a, a[j], a[k], a[l], a[m]), 0), torch.cat((t, t[j], t[k], t[l], t[m]), 0)
@@ -780,11 +781,11 @@ def print_results(k):
    wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])  # wh

    # Filter
-    i = (wh0 < 4.0).any(1).sum()
+    i = (wh0 < 3.0).any(1).sum()
    if i:
        print('WARNING: Extremely small objects found. '
-              '%g of %g labels are < 4 pixels in width or height.' % (i, len(wh0)))
-    wh = wh0[(wh0 >= 4.0).any(1)]  # filter > 2 pixels
+              '%g of %g labels are < 3 pixels in width or height.' % (i, len(wh0)))
+    wh = wh0[(wh0 >= 2.0).any(1)]  # filter > 2 pixels

    # Kmeans calculation
    from scipy.cluster.vq import kmeans
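The relaxed filter above warns below 3 px but only drops boxes with no side of at least 2 px; a standalone sketch of that numpy logic:

import numpy as np

wh0 = np.array([[1.5, 40.], [10., 12.], [1.5, 1.9], [300., 150.]])
print((wh0 < 3.0).any(1).sum())  # 2 -> two labels have a side < 3 pixels (warning)
print(wh0[(wh0 >= 2.0).any(1)])  # drops only [1.5, 1.9]; [1.5, 40.] survives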