From 594773f817e1dc306860be661358c335a44737f1 Mon Sep 17 00:00:00 2001 From: Bing Xu Date: Wed, 23 Sep 2015 23:35:17 -0600 Subject: [PATCH] [NDArray] add clip op --- example/notebooks/alexnet.ipynb | 579 --------------------------- example/notebooks/cifar-recipe.ipynb | 57 +-- python/mxnet/__init__.py | 2 + python/mxnet/ndarray.py | 13 + python/mxnet/optimizer.py | 13 +- src/ndarray/ndarray.cc | 3 +- src/ndarray/ndarray_function-inl.h | 2 + src/ndarray/ndarray_function.h | 10 + 8 files changed, 70 insertions(+), 609 deletions(-) delete mode 100644 example/notebooks/alexnet.ipynb diff --git a/example/notebooks/alexnet.ipynb b/example/notebooks/alexnet.ipynb deleted file mode 100644 index e6f2ad94e296..000000000000 --- a/example/notebooks/alexnet.ipynb +++ /dev/null @@ -1,579 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Basic AlexNet Example\n", - "--------\n", - "\n", - "This notebook shows how to use MXNet construct AlexNet. AlexNet is made by Alex Krizhevsky in 2012.\n", - "\n", - "We will show how to train AlexNet in Python with single/multi GPU. All you need is to write a piece of Python code to describe network, then MXNet will help you finish all work without any of your effort. \n", - "\n", - "Notice: This notebook is a basic demo to show MXNet flavor. To train a full state-of-art network, please refer our ```Inception``` example.\n", - "\n", - "Generally, we need \n", - "\n", - "- Declare symbol network\n", - "- Declare data iterator\n", - "- Bind symbol network to device to model\n", - "- Fit the model" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import mxnet as mx" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have successully load MXNet. we will start declare a symbolic network. " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "input_data = mx.symbol.Variable(name=\"data\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use a special symbol ```Variable``` to represent input data." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# stage 1\n", - "conv1 = mx.symbol.Convolution(data=input_data, kernel=(11, 11), stride=(4, 4), num_filter=96)\n", - "relu1 = mx.symbol.Activation(data=conv1, act_type=\"relu\")\n", - "pool1 = mx.symbol.Pooling(data=relu1, pool_type=\"max\", kernel=(3, 3), stride=(2,2))\n", - "lrn1 = mx.symbol.LRN(data=pool1, alpha=0.0001, beta=0.75, knorm=1, nsize=5)\n", - "# stage 2\n", - "conv2 = mx.symbol.Convolution(data=lrn1, kernel=(5, 5), pad=(2, 2), num_filter=256)\n", - "relu2 = mx.symbol.Activation(data=conv2, act_type=\"relu\")\n", - "pool2 = mx.symbol.Pooling(data=relu2, kernel=(3, 3), stride=(2, 2))\n", - "lrn2 = mx.symbol.LRN(data=pool2, alpha=0.0001, beta=0.75, knorm=1, nsize=5)\n", - "# stage 3\n", - "conv3 = mx.symbol.Convolution(data=lrn2, kernel=(3, 3), pad=(1, 1), num_filter=384)\n", - "relu3 = mx.symbol.Activation(data=conv3, act_type=\"relu\")\n", - "conv4 = mx.symbol.Convolution(data=relu3, kernel=(3, 3), pad=(1, 1), num_filter=384)\n", - "relu4 = mx.symbol.Activation(data=conv4, act_type=\"relu\")\n", - "conv5 = mx.symbol.Convolution(data=relu4, kernel=(3, 3), pad=(1, 1), num_filter=256)\n", - "relu5 = mx.symbol.Activation(data=conv5, act_type=\"relu\")\n", - "pool3 = mx.symbol.Pooling(data=relu5, kernel=(3, 3), stride=(2, 2))\n", - "# stage 4\n", - "flatten = mx.symbol.Flatten(data=pool3)\n", - "fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096)\n", - "relu6 = mx.symbol.Activation(data=fc1, act_type=\"relu\")\n", - "dropout1 = mx.symbol.Dropout(data=relu6, p=0.5)\n", - "# stage 5\n", - "fc2 = mx.symbol.FullyConnected(data=dropout1, num_hidden=4096)\n", - "relu7 = mx.symbol.Activation(data=fc2, act_type=\"relu\")\n", - "dropout2 = mx.symbol.Dropout(data=relu7, p=0.5)\n", - "# stage 6\n", - "fc3 = mx.symbol.FullyConnected(data=dropout2, num_hidden=1000)\n", - "softmax = mx.symbol.Softmax(data=fc3)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have a AlexNet in symbolic level. The ```softmax``` symbol contains all network structures. By indicate ```data``` for each symbol, the last symbol composite all info we need. We can visualize our network structure. 
(require ```graphviz``` package)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "image/svg+xml": [ - "[SVG rendering of the AlexNet graph omitted: data feeds the Convolution/Activation/Pooling/LRN stages, then Flatten, two FullyConnected(4096)+Dropout blocks, FullyConnected(1000) and the final Softmax]\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "mx.viz.plot_network(softmax)" - ] - },
- { - "cell_type": "markdown", - "metadata": { - "collapsed": true - }, - "source": [ - "The next step is declare data iterator. We provide high perfomance RecordIO image iterator for ImageNet task. Please pack the images into record file before use. For how to pack image and more details about image data iterator and build-in io iterator, please read [io doc](https://github.com/dmlc/mxnet/blob/master/doc/python/io.md)" - ] - },
- { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# We set batch size for to 256\n", - "batch_size = 256\n", - "# We need to set correct path to image record file\n", - "# For ```mean_image```. if it doesn't exist, the iterator will generate one\n", - "# On HDD, single thread is able to process 800 images / sec\n", - "# the input shape is in format (channel, height, width)\n", - "# rand_crop option make source image randomly cropped to input_shape (3, 224, 224)\n", - "# rand_mirror option make source image randomly mirrored\n", - "# We use 2 threads to processing our data\n", - "train_dataiter = mx.io.ImageRecordIter(\n", - " shuffle=True,\n", - " path_imgrec=\"./Data/ImageNet/train.rec\",\n", - " mean_img=\"./Data/ImageNet/mean_224.bin\",\n", - " rand_crop=True,\n", - " rand_mirror=True,\n", - " data_shape=(3, 224, 224),\n", - " batch_size=batch_size,\n", - " prefetch_buffer=4,\n", - " preprocess_threads=2)\n", - "# similarly, we can declare our validation iterator\n", - "val_dataiter = mx.io.ImageRecordIter(\n", - " path_imgrec=\"./Data/ImageNet/val.rec\",\n", - " mean_img=\"./Data/ImageNet/mean_224.bin\",\n", - " rand_crop=False,\n", - " rand_mirror=False,\n", - " data_shape=(3, 224, 224),\n", - " batch_size=batch_size,\n", - " prefetch_buffer=4,\n", - " preprocess_threads=2)" - ] - },
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next step, we will initialize our model from symbol. 
To run on a single GPU, we need to declare:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# For demo purpose, we just run 1 epoch\n", - "num_round = 1\n", - "# set context to GPU, if you want to use cpu, set it to mx.cpu()\n", - "ctx = mx.gpu() \n", - "# note: for input shape in model, we must contain batch size\n", - "data_shape = (batch_size, 3, 224, 224)\n", - "\n", - "model = mx.model.FeedForward(symbol=softmax, ctx=ctx, input_shape=data_shape, num_round=num_round,\n", - " learning_rate=0.01, momentum=0.9, wd=0.0001)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To run on multiply GPU, we need to declare" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "# For demo purpose, we just run 1 epoch\n", - "num_round = 1\n", - "# Assume we have 4 GPU, we can make a context list contains 4 device\n", - "num_devs = 4\n", - "ctx = [mx.gpu(i) for i in range(num_devs)]\n", - "# note: for input shape in model, we must contain batch size\n", - "data_shape = (batch_size, 3, 224, 224)\n", - "\n", - "model = mx.model.FeedForward(symbol=softmax, ctx=ctx, input_shape=data_shape, num_round=num_round,\n", - " learning_rate=0.01, momentum=0.9, wd=0.0001)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "ename": "MXNetError", - "evalue": "[12:00:28] src/ndarray/ndarray.cc:157: Check failed: from.shape() == to->shape() operands shape mismatch", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mMXNetError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# In this case, eval_data is also a data iterator\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# We will use accuracy to measure our model's performace\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mtrain_dataiter\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meval_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mval_dataiter\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meval_metric\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'acc'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, eval_data, eval_metric, verbose)\u001b[0m\n\u001b[0;32m 304\u001b[0m \u001b[0mtrain_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0meval_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0meval_data\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 305\u001b[0m \u001b[0meval_metric\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0meval_metric\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 306\u001b[1;33m verbose=verbose)\n\u001b[0m", - "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/model.py\u001b[0m in \u001b[0;36m_train\u001b[1;34m(symbol, ctx, input_shape, arg_params, aux_params, 
begin_round, end_round, optimizer, train_data, eval_data, eval_metric, iter_end_callback, verbose)\u001b[0m\n\u001b[0;32m 85\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweight\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marg_names\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0marg_arrays\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 86\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0marg_params\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 87\u001b[1;33m \u001b[0marg_params\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcopyto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mweight\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 88\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mweight\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mlist\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mzip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0maux_names\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maux_arrays\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 89\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mkey\u001b[0m \u001b[1;32min\u001b[0m \u001b[0maux_params\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/ndarray.py\u001b[0m in \u001b[0;36mcopyto\u001b[1;34m(self, other)\u001b[0m\n\u001b[0;32m 306\u001b[0m RuntimeWarning)\n\u001b[0;32m 307\u001b[0m \u001b[1;32mreturn\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 308\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mNDArray\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_copyto\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mother\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 309\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mother\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mContext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 310\u001b[0m \u001b[0mhret\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mNDArray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_new_alloc_handle\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mother\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/ndarray.py\u001b[0m in \u001b[0;36mgeneric_ndarray_function\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 618\u001b[0m \u001b[0mc_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mNDArrayHandle\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhandle\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0muse_vars_range\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 619\u001b[0m 
\u001b[0mc_array\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmx_float\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mscalar_range\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m\\\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 620\u001b[1;33m c_array(NDArrayHandle, [v.handle for v in mutate_vars])))\n\u001b[0m\u001b[0;32m 621\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mn_mutate_vars\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 622\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mmutate_vars\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/base.py\u001b[0m in \u001b[0;36mcheck_call\u001b[1;34m(ret)\u001b[0m\n\u001b[0;32m 95\u001b[0m \"\"\"\n\u001b[0;32m 96\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mret\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 97\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mMXNetError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpy_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mMXGetLastError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 98\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 99\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mc_str\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstring\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mMXNetError\u001b[0m: [12:00:28] src/ndarray/ndarray.cc:157: Check failed: from.shape() == to->shape() operands shape mismatch" - ] - } - ], - "source": [ - "# Now we can fit the model with data iterators\n", - "# When we use data iterator, we don't need to set y because label comes from data iterator directly\n", - "# In this case, eval_data is also a data iterator\n", - "# We will use accuracy to measure our model's performace\n", - "model.fit(X=train_dataiter, eval_data=val_dataiter, eval_metric='acc')\n", - "# You need to wait for a while to get the result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's all!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.4.2" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb index fccdfcb47e43..05097b026042 100644 --- a/example/notebooks/cifar-recipe.ipynb +++ b/example/notebooks/cifar-recipe.ipynb @@ -237,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -247,16 +247,16 @@ "output_type": "stream", "text": [ "INFO:root:Start training with [gpu(0)]\n", - "INFO:root:Batch [50]\tSpeed: 1091.84 samples/sec\n", - "INFO:root:Batch [100]\tSpeed: 1084.80 samples/sec\n", - "INFO:root:Batch [150]\tSpeed: 1084.55 samples/sec\n", - "INFO:root:Batch [200]\tSpeed: 1077.30 samples/sec\n", - "INFO:root:Batch [250]\tSpeed: 1074.73 samples/sec\n", - "INFO:root:Batch [300]\tSpeed: 1075.67 samples/sec\n", - "INFO:root:Batch [350]\tSpeed: 1067.09 samples/sec\n", - "INFO:root:Iteration[0] Train-accuracy=0.525695\n", - "INFO:root:Iteration[0] Time cost=47.012\n", - "INFO:root:Iteration[0] Validation-accuracy=0.660008\n" + "INFO:root:Batch [50]\tSpeed: 1003.50 samples/sec\n", + "INFO:root:Batch [100]\tSpeed: 976.31 samples/sec\n", + "INFO:root:Batch [150]\tSpeed: 975.57 samples/sec\n", + "INFO:root:Batch [200]\tSpeed: 964.21 samples/sec\n", + "INFO:root:Batch [250]\tSpeed: 963.53 samples/sec\n", + "INFO:root:Batch [300]\tSpeed: 963.95 samples/sec\n", + "INFO:root:Batch [350]\tSpeed: 963.71 samples/sec\n", + "INFO:root:Iteration[0] Train-accuracy=0.520520\n", + "INFO:root:Iteration[0] Time cost=52.424\n", + "INFO:root:Iteration[0] Validation-accuracy=0.652393\n" ] } ], @@ -272,14 +272,14 @@ "# eval_data=test_dataiter,\n", "# eval_metric=\"accuracy\",\n", "# epoch_end_callback=mx.helper.Speedometer(batch_size),\n", - "# iter_end_callback=mx.model.do_checkpoint(model_prefix))\n" + "# iter_end_callback=mx.callback.do_checkpoint(model_prefix))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After only 1 epoch, our model is able to acheive about 66% accuracy on testset.\n", + "After only 1 epoch, our model is able to acheive about 65% accuracy on testset.\n", "We can save our model by calling either ```save``` or using ```pickle```.\n" ] }, @@ -348,7 +348,7 @@ "output_type": "stream", "text": [ "INFO:root:Finish predict...\n", - "INFO:root:final accuracy = 0.651000\n" + "INFO:root:final accuracy = 0.652600\n" ] } ], @@ -385,33 +385,36 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ { - "ename": "TypeError", - "evalue": "Symbol only support integer index to fetch i-th output", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0minternals\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0msoftmax\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_internals\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mfea_symbol\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0minternals\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"global_avg_output\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, \n", - "\u001b[1;32m/home/bing/wtf/mxnet/python/mxnet/symbol.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, index)\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__getitem__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 157\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 158\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'Symbol only support integer index to fetch i-th output'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 159\u001b[0m \u001b[0mhandle\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mSymbolHandle\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 160\u001b[0m check_call(_LIB.MXSymbolGetOutput(\n", - "\u001b[1;31mTypeError\u001b[0m: Symbol only support integer index to fetch i-th output" + "name": "stdout", + "output_type": "stream", + "text": [ + "(10000, 336, 1, 1)\n" ] } ], "source": [ - "# predict internal featuremaps\n", + "# Predict internal featuremaps\n", + "# From a symbol, we are able to get all internals. Note it is still a symbol\n", "internals = softmax.get_internals()\n", - "\n", + "# We get get an internal symbol for the feature.\n", + "# By default, the symbol is named as \"symbol_name + _output\"\n", + "# in this case we'd like to get global_avg\" layer's output as feature, so its \"global_avg_output\"\n", + "# You may call ```internals.list_outputs()``` to find the target\n", + "# but we strongly suggests set a special name for special symbol \n", "fea_symbol = internals[\"global_avg_output\"]\n", "\n", - "feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=group, \n", + "# Make a new model by using an internal symbol. We can reuse all parameters from model we trained before\n", + "# In this case, we must set ```allow_extra_params``` to True\n", + "feature_extractor = mx.model.FeedForward(ctx=mx.gpu(), symbol=fea_symbol, \n", " arg_params=model.arg_params, aux_params=model.aux_params,\n", " allow_extra_params=True)\n", + "# Predict as normal\n", "global_pooling_feature = feature_extractor.predict(test_dataiter)\n", "print(global_pooling_feature.shape)" ] diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py index 7bca6efbb46d..b87b9dad924c 100644 --- a/python/mxnet/__init__.py +++ b/python/mxnet/__init__.py @@ -22,6 +22,8 @@ from . import optimizer from . import model from . import initializer +# use mx.init as short for mx.initializer +from . import initializer as init from . import visualization # use viz as short for mx.ndarray from . 
import visualization as viz
diff --git a/python/mxnet/ndarray.py b/python/mxnet/ndarray.py
index 2e9659bfaf2c..150210fc09e3 100644
--- a/python/mxnet/ndarray.py
+++ b/python/mxnet/ndarray.py
@@ -243,6 +243,16 @@ def _slice(self, start, stop):
             self.handle, start, stop, ctypes.byref(handle)))
         return NDArray(handle=handle, writable=self.writable)
 
+    def clip(self, value):
+        """Clip the NDArray to the range [-value, value]; NaN entries become 0.
+
+        Parameters
+        ----------
+        value : float
+            clipping range; entries are limited to [-value, value]
+        """
+        return NDArray._clip_scalar(self, float(value))
+
     def wait_to_read(self):
         """Block until all pending writes operations on current NDArray are finished.
 
@@ -636,6 +646,9 @@ def generic_ndarray_function(*args, **kwargs):
     ret_function.__name__ = func_name
     ret_function.__doc__ = doc_str
     return ret_function
+
+
+
 # pylint: enable=too-many-locals, invalid-name
 
 def _init_ndarray_module():
diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py
index 5dc444e21620..6c317f493d02 100644
--- a/python/mxnet/optimizer.py
+++ b/python/mxnet/optimizer.py
@@ -34,14 +34,19 @@ class SGD(Optimizer):
     rescale_grad : float, optional
         rescaling factor of gradient.
+
+    clip_gradient : float, optional
+        clip the gradient to the range [-clip_gradient, clip_gradient]
     """
     def __init__(self, learning_rate=0.01, momentum=0.0,
-                 wd=0.0001, rescale_grad=1, lr_scheduler=None):
+                 wd=0.0001, rescale_grad=1, clip_gradient=None,
+                 lr_scheduler=None):
         super(SGD, self).__init__()
         self.lr = learning_rate
         self.momentum = momentum
         self.wd = wd
         self.rescale_grad = rescale_grad
+        self.clip_gradient = clip_gradient
         self.lr_scheduler = lr_scheduler
         if lr_scheduler != None:
             self.lr_scheduler.base_lr = learning_rate
@@ -89,7 +94,11 @@ def update(self, index, weight, grad, state):
         if state:
             mom = state
             mom[:] *= self.momentum
-            mom[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
+            if self.clip_gradient is None:
+                mom[:] += -lr * (grad * self.rescale_grad + self.wd * weight)
+            else:
+                mom[:] += -lr * (grad.clip(self.clip_gradient) * self.rescale_grad +
+                                 self.wd * weight)
             weight[:] += mom
         else:
             assert self.momentum == 0.0
diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc
index 210d4b7926f3..335baef17198 100644
--- a/src/ndarray/ndarray.cc
+++ b/src/ndarray/ndarray.cc
@@ -315,6 +315,7 @@ NDArray operator*(const NDArray &lhs, const real_t &rhs) {
   return ScalarOpRet(lhs, rhs);
 }
 NDArray operator/(const NDArray &lhs, const real_t &rhs) {
   return ScalarOpRet(lhs, rhs);
 }
+
 // Binary
 NDArray &NDArray::operator=(real_t scalar) {
   SetValueOp(scalar, this);
@@ -510,7 +511,7 @@
 MXNET_REGISTER_NDARRAY_FUN(_plus_scalar).set_function(ScalarOp);
 MXNET_REGISTER_NDARRAY_FUN(_mul_scalar).set_function(ScalarOp);
 MXNET_REGISTER_NDARRAY_FUN(_div_scalar).set_function(ScalarOp);
-
+MXNET_REGISTER_NDARRAY_FUN(_clip_scalar).set_function(ScalarOp);
 // register API function
 // scalar, reverse scalar
 MXNET_REGISTER_NDARRAY_FUN(_rminus_scalar).set_function(ScalarOp);
diff --git a/src/ndarray/ndarray_function-inl.h b/src/ndarray/ndarray_function-inl.h
index 34a81af1bb39..604173c9e68a 100644
--- a/src/ndarray/ndarray_function-inl.h
+++ b/src/ndarray/ndarray_function-inl.h
@@ -100,11 +100,13 @@ DECL_SCALAR(DEVICE, Plus, EvalScalar_, true)
 DECL_SCALAR(DEVICE, Minus, EvalScalar_, true)
 DECL_SCALAR(DEVICE, Mul, EvalScalar_, true)
 DECL_SCALAR(DEVICE, Div, EvalScalar_, true)
+DECL_SCALAR(DEVICE, Clip, EvalScalar_, true)
 // for reverse seq
 DECL_SCALAR(DEVICE, Plus, EvalScalar_, false)
 DECL_SCALAR(DEVICE, Minus, EvalScalar_, false)
 DECL_SCALAR(DEVICE, Mul, EvalScalar_, false)
 DECL_SCALAR(DEVICE, Div, EvalScalar_, false)
+DECL_SCALAR(DEVICE, Clip, EvalScalar_, false)
 
 } // namespace ndarray
 } // namespace mxnet
diff --git a/src/ndarray/ndarray_function.h b/src/ndarray/ndarray_function.h
index a54766c75002..e7a405b2b115 100644
--- a/src/ndarray/ndarray_function.h
+++ b/src/ndarray/ndarray_function.h
@@ -37,6 +37,16 @@ struct Div : public BinaryBase {
   typedef mshadow::op::div mshadow_op;
 };
+struct Clip : public BinaryBase {
+  struct mshadow_op {
+    MSHADOW_XINLINE static real_t Map(real_t a, real_t b) {
+      if (isnan(a)) return 0.0f;
+      if (a < -b) return -b;
+      if (a > b) return b;
+      return a;
+    }
+  };
+};
 
 // type holder for random number generators
 struct UniformDistribution {};
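
A minimal usage sketch of the new clip op and the SGD clip_gradient option, assuming an mxnet Python package built with this patch; the sample values and the mx.ndarray.array / asnumpy helpers used to move data in and out are illustrative only, not part of this change.

    import numpy as np
    import mxnet as mx

    # Clip an NDArray to [-1, 1]; per ndarray::Clip above, NaN entries become 0.
    a = mx.ndarray.array(np.array([-3.0, -0.5, 0.5, 3.0, np.nan], dtype=np.float32))
    b = a.clip(1.0)        # dispatches to the registered _clip_scalar function
    print(b.asnumpy())     # expected: [-1.  -0.5  0.5  1.   0. ]

    # SGD with gradient clipping: each gradient entry is clipped to
    # [-clip_gradient, clip_gradient] before momentum and weight decay are applied.
    opt = mx.optimizer.SGD(learning_rate=0.01, momentum=0.9,
                           wd=0.0001, clip_gradient=1.0)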