diff --git a/BP Algorithm.ipynb b/BP Algorithm.ipynb
new file mode 100644
index 0000000..de98784
--- /dev/null
+++ b/BP Algorithm.ipynb
@@ -0,0 +1,545 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# BP Algorithm\n",
+    "谢昊君 1200015556 元培学院"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Derivation of the k-layer BP algorithm\n",
+    "Let $W^{(k)}$ denote the weight matrix from layer $k-1$ to layer $k$, with pre-activations $z^{(i)} = W^{(i)}\\cdot a^{(i-1)} + b^{(i)}$. Alternatively, append a constant 1 to each activation vector so the bias is absorbed into $W$, giving $a^{(i)} = [f(W^{(i)}\\cdot a^{(i-1)}); 1]$.\n",
+    "\n",
+    "$$a^{(i+1)}=f(z^{(i+1)})$$\n",
+    "\n",
+    "The activation function satisfies $f'(z)=f(z)(1-f(z))$ (true for the sigmoid).\n",
+    "\n",
+    "$h_{W,b}(x)$ denotes the network's prediction for input $x$.\n",
+    "\n",
+    "The loss for a single example is\n",
+    "$$J(W,b; x,y) = \\frac{1}{2} \\left\\| h_{W,b}(x) - y \\right\\|^2.$$\n",
+    "Over $m$ training examples the total loss is\n",
+    "$$J(W,b)=\\frac{1}{m}\\sum_{i=1}^m J(W,b;x^{(i)},y^{(i)})$$\n",
+    "\n",
+    "The weight matrix $W$ is updated by gradient descent on this loss:\n",
+    "\\begin{align}\n",
+    "W_{ij}^{(l)} &= W_{ij}^{(l)} - \\alpha \\frac{\\partial}{\\partial W_{ij}^{(l)}} J(W,b) \\\\\n",
+    "b_{i}^{(l)} &= b_{i}^{(l)} - \\alpha \\frac{\\partial}{\\partial b_{i}^{(l)}} J(W,b)\n",
+    "\\end{align}\n",
+    "\n",
+    "It remains to compute $\\frac{\\partial}{\\partial W_{ij}^{(l)}} J(W,b)$.\n",
+    "\n",
+    "First consider the derivative of the loss with respect to each unit, i.e. each unit's contribution to the loss: define $\\delta_i^{(k)}$ as the contribution of unit $i$ in layer $k$.\n",
+    "\n",
+    "For the output layer $n_l$,\n",
+    "$$\\delta_i^{(n_l)} = \\frac{\\partial}{\\partial z_i^{(n_l)}} \\frac{1}{2} \\left\\| h_{W,b}(x) - y \\right\\|^2 = - (y_i - a^{(n_l)}_i) \\cdot f'(z^{(n_l)}_i)\n",
+    "$$\n",
+    "\n",
+    "Differentiating on toward the inner layers gives\n",
+    "\n",
+    "$$\\delta^{(l)}_i = \\left( \\sum_{j=1}^{s_{l+1}} W^{(l)}_{ji} \\delta^{(l+1)}_j \\right) f'(z^{(l)}_i)$$\n",
+    "\n",
+    "By the chain rule, the derivatives with respect to the weights $W_{ij}$ and $b$ are\n",
+    "$$ \\frac{\\partial}{\\partial W_{ij}^{(l)}} J(W,b; x, y) = a^{(l)}_j \\delta_i^{(l+1)}$$\n",
+    "$$ \\frac{\\partial}{\\partial b_{i}^{(l)}} J(W,b; x, y) = \\delta_i^{(l+1)}$$\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import numpy.random as rd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Activation functions and their derivatives. Note that fit() below\n",
+    "# evaluates the derivative at the activation a = f(z), not at the\n",
+    "# pre-activation z, so both derivatives are written in terms of the output.\n",
+    "def sigmoid(x):\n",
+    "    return 1.0/(1.0 + np.exp(-x))\n",
+    "\n",
+    "def sigmoid_prime(a):\n",
+    "    # f'(z) = f(z)(1 - f(z)) = a(1 - a) with a = sigmoid(z)\n",
+    "    return a*(1.0 - a)\n",
+    "\n",
+    "def tanh(x):\n",
+    "    return np.tanh(x)\n",
+    "\n",
+    "def tanh_prime(a):\n",
+    "    # f'(z) = 1 - tanh(z)^2 = 1 - a^2 with a = tanh(z)\n",
+    "    return 1.0 - a**2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 186,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# BP algorithm for a feed-forward neural network.\n",
+    "class NeuralNetwork:\n",
+    "    # The default activation is the sigmoid.\n",
+    "    def __init__(self, layers, activation='sigmoid'):\n",
+    "        # Target precision; set in fit().\n",
+    "        self.error_criterion = None\n",
+    "        # Initialize the error to a value larger than any sensible\n",
+    "        # criterion; iteration stops once it drops below the criterion.\n",
+    "        self.error = 1\n",
+    "        if activation == 'sigmoid':\n",
+    "            self.activation = sigmoid\n",
+    "            self.activation_prime = sigmoid_prime\n",
+    "        elif activation == 'tanh':\n",
+    "            self.activation = tanh\n",
+    "            self.activation_prime = tanh_prime\n",
+    "\n",
+    "        # Set weights\n",
+    "        self.weights = []\n",
+    "        # Hidden layers\n",
+    "        for i in range(1, len(layers) - 1):\n",
+    "            # Initial weights are uniform on (-1, 1); each matrix is\n",
+    "            # (inputs + 1) x (units), the extra row holding the bias weights.\n",
+    "            r = 2*np.random.random((layers[i-1] + 1, layers[i])) - 1\n",
+    "            self.weights.append(r)\n",
+    "        # output layer - random((2+1, 1)) : 3 x 1\n",
+    "        r = 2*np.random.random((layers[i] + 1, layers[i+1])) - 1\n",
+    "        self.weights.append(r)\n",
+    "\n",
+    "    def fit(self, X, y, learning_rate=1, epochs=10000, error_criterion=0.01):\n",
+    "        self.error_criterion = error_criterion\n",
+    "        # epochs is an upper bound on the number of iterations: training\n",
+    "        # stops once the error drops below the criterion, or after epochs\n",
+    "        # iterations, whichever comes first.\n",
+    "        for k in range(epochs):\n",
+    "            if k % 1000 == 0:\n",
+    "                print('epochs:', k)\n",
+    "\n",
+    "            # Compute the mean squared error; stop updating once it falls\n",
+    "            # below the criterion. predict() returns an (n, 1) column, so\n",
+    "            # flatten both sides to keep the subtraction from broadcasting\n",
+    "            # to an (n, n) matrix.\n",
+    "            prediction = self.predict(X)\n",
+    "            mean_square_error = ((np.ravel(y) - prediction.ravel())**2).mean()\n",
+    "            self.error = mean_square_error\n",
+    "            if mean_square_error < error_criterion:\n",
+    "                break\n",
+    "            # Pick a random training example for this update.\n",
+    "            i = np.random.randint(X.shape[0])\n",
+    "            a = [X[i]]\n",
+    "\n",
+    "            for l in range(len(self.weights)):\n",
+    "                dot_value = np.dot(np.hstack((1, a[l])), self.weights[l])\n",
+    "                activation = self.activation(dot_value)\n",
+    "                a.append(activation)\n",
+    "\n",
+    "            errors = y[i] - a[-1]\n",
+    "\n",
+    "            deltas = [errors * self.activation_prime(a[-1])]\n",
+    "\n",
+    "            # Deltas in reverse order; the bias row of each weight matrix is\n",
+    "            # excluded, and the product with f'(a) is elementwise.\n",
+    "            for l in range(len(a) - 2, 0, -1):\n",
+    "                deltas.append(np.dot(self.weights[l][1:], deltas[-1]) * self.activation_prime(a[l]))\n",
+    "\n",
+    "            deltas.reverse()\n",
+    "\n",
+    "            # Update the weights.\n",
+    "            for l in range(len(self.weights)):\n",
+    "                layer = np.atleast_2d(np.hstack((1, a[l])))\n",
+    "                delta = np.atleast_2d(deltas[l])\n",
+    "                self.weights[l] += learning_rate * layer.T.dot(delta)\n",
+    "\n",
+    "    # Compute the network output from the inputs and the parameters.\n",
+    "    def predict(self, X):\n",
+    "        tmp = X\n",
+    "        for l in range(0, len(self.weights)):\n",
+    "            ones = np.ones((tmp.shape[0], 1))\n",
+    "            dots = np.dot(np.hstack((ones, tmp)), self.weights[l])\n",
+    "            tmp = self.activation(dots)\n",
+    "        return tmp\n",
+    "\n",
+    "    def mean_square_error(self, X, y):\n",
+    "        return ((np.ravel(y) - self.predict(X).ravel())**2).mean()"
+   ]
+  },
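+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a sanity check on the derivation, the backpropagated gradients can be compared against centered finite differences on a tiny network. The cell below is an illustrative sketch added for this write-up (the helpers `loss` and `bp_gradients` are not part of the assignment); it uses the same bias-in-row-0 weight layout as `NeuralNetwork` above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Numerical check of the BP gradient formulas (illustrative sketch).\n",
+    "rng = np.random.RandomState(0)\n",
+    "\n",
+    "# Tiny 2-3-1 sigmoid network; the bias weights sit in row 0 of each matrix.\n",
+    "W = [rng.randn(3, 3), rng.randn(4, 1)]\n",
+    "x = np.array([0.3, 0.7])\n",
+    "y_check = np.array([1.0])\n",
+    "\n",
+    "def loss(W):\n",
+    "    a = x\n",
+    "    for Wl in W:\n",
+    "        a = sigmoid(np.hstack((1, a)).dot(Wl))\n",
+    "    return 0.5*((a - y_check)**2).sum()\n",
+    "\n",
+    "def bp_gradients(W):\n",
+    "    # Forward pass, keeping every activation.\n",
+    "    a = [x]\n",
+    "    for Wl in W:\n",
+    "        a.append(sigmoid(np.hstack((1, a[-1])).dot(Wl)))\n",
+    "    # Output layer: delta = -(y - a) * f'(z), with f'(z) = a(1 - a).\n",
+    "    delta = -(y_check - a[-1]) * a[-1]*(1 - a[-1])\n",
+    "    grads = [None]*len(W)\n",
+    "    for l in range(len(W) - 1, -1, -1):\n",
+    "        # dJ/dW^(l)_{ij} = a^(l)_j * delta^(l+1)_i, plus the bias row.\n",
+    "        grads[l] = np.outer(np.hstack((1, a[l])), delta)\n",
+    "        if l > 0:\n",
+    "            delta = W[l][1:].dot(delta) * a[l]*(1 - a[l])\n",
+    "    return grads\n",
+    "\n",
+    "grads = bp_gradients(W)\n",
+    "eps = 1e-6\n",
+    "for l in range(len(W)):\n",
+    "    num = np.zeros_like(W[l])\n",
+    "    for idx in np.ndindex(*W[l].shape):\n",
+    "        Wp = [w.copy() for w in W]; Wp[l][idx] += eps\n",
+    "        Wm = [w.copy() for w in W]; Wm[l][idx] -= eps\n",
+    "        num[idx] = (loss(Wp) - loss(Wm))/(2*eps)\n",
+    "    print('layer', l, 'max |BP - numeric|:', np.abs(grads[l] - num).max())"
+   ]
+  },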
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Question 2\n",
+    "Training on the XOR function; the results were not ideal.\n",
+    "Even after 9000 iterations (a run of 100000 iterations was also tried), the outputs remained stuck near 0.5, for reasons that were initially unclear. In hindsight, a plausible cause is a miscomputed gradient: applying sigmoid_prime to values that are already sigmoid outputs (instead of using $a(1-a)$ directly), or collapsing the hidden-layer delta to a scalar with an inner product instead of an elementwise product, produces wrong updates and lets the output settle near 0.5."
+   ]
+  },
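+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The cell below is a hypothetical re-run sketch (not executed here; `nn2`, `X_xor`, and `y_xor` are names introduced for illustration). With a consistent derivative convention a 2-3-1 sigmoid network can usually learn XOR, though success still depends on the random initialization, so a seed is fixed for reproducibility."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Hypothetical re-run of the XOR experiment (not executed here).\n",
+    "np.random.seed(0)\n",
+    "X_xor = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])\n",
+    "y_xor = np.array([0, 1, 1, 0])\n",
+    "nn2 = NeuralNetwork([2, 3, 1])\n",
+    "nn2.fit(X_xor, y_xor, learning_rate=0.5, epochs=20000, error_criterion=0.01)\n",
+    "nn2.predict(X_xor)"
+   ]
+  },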
+  {
+   "cell_type": "code",
+   "execution_count": 187,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epochs: 0\n",
+      "epochs: 1000\n",
+      "epochs: 2000\n",
+      "epochs: 3000\n",
+      "epochs: 4000\n",
+      "epochs: 5000\n",
+      "epochs: 6000\n",
+      "epochs: 7000\n",
+      "epochs: 8000\n",
+      "epochs: 9000\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Test for XOR\n",
+    "nn = NeuralNetwork([2, 3, 1])\n",
+    "\n",
+    "X = np.array([[0, 0],[0, 1],[1, 0],[1, 1]])\n",
+    "\n",
+    "y = np.array([0, 1, 1, 0])\n",
+    "\n",
+    "nn.fit(X, y)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 188,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 0.3429196 ],\n",
+       "       [ 0.35613153],\n",
+       "       [ 0.33280655],\n",
+       "       [ 0.33445151]])"
+      ]
+     },
+     "execution_count": 188,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nn.predict(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 189,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.26937145976643928"
+      ]
+     },
+     "execution_count": 189,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nn.error"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Test for Question 3\n",
+    "Approximate the continuous function $y=f(x)=1/(1+\\sqrt{x_1^2+x_2^2})$ on $[0,1]^2$ with a 2-m-1 feed-forward network.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Target function, vectorized over the rows of a numpy array\n",
+    "# (the sqrt follows the function stated above).\n",
+    "def train_fun(x):\n",
+    "    return 1/(1 + np.sqrt((x*x).sum(1)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# A uniform 100 x 100 grid of cell centers inside the unit square.\n",
+    "data = []\n",
+    "for i in range(100):\n",
+    "    for j in range(100):\n",
+    "        data.append([i, j])\n",
+    "data = np.array(data)\n",
+    "data = 0.005 + data/100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Randomly pick 5000 grid points for training; the rest are held out.\n",
+    "train_loc = rd.choice(10000, 5000, replace=False)\n",
+    "tmp = set(train_loc)\n",
+    "tmp2 = set(range(10000))\n",
+    "# set.difference_update mutates in place and returns None, so use\n",
+    "# set.difference to get the held-out indices.\n",
+    "tmp3 = tmp2.difference(tmp)\n",
+    "training_data = data[train_loc]\n",
+    "testing_data = data[list(tmp3)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Training here is very slow and struggles to converge to the required error (and at times the criterion appeared to be met without the loop breaking). One likely reason for the slowness: the stopping criterion requires a full predict() pass over all 5000 training points before every single-sample update, which dominates the runtime; a sketch that checks the criterion only periodically follows the interrupted run below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 148,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "epochs: 0\n"
+     ]
+    },
+    {
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mtestnn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mNeuralNetwork\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m5\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0my\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtrain_fun\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mtestnn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtraining_data\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;32m\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, learning_rate, epochs, error_criterion)\u001b[0m\n\u001b[0;32m     34\u001b[0m \u001b[1;31m# 计算mean squared error,若小于criterion即停止更新。\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     35\u001b[0m \u001b[0mprediction\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m \u001b[0mmean_square_error\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m-\u001b[0m\u001b[0mprediction\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m**\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     37\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0merror\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmean_square_error\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     38\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmean_square_error\u001b[0m \u001b[1;33m<\u001b[0m \u001b[0merror_criterion\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
+     ]
+    }
+   ],
+   "source": [
+    "testnn = NeuralNetwork([2,5,1])\n",
+    "y = train_fun(training_data)\n",
+    "testnn.fit(training_data,y)"
+   ]
+  },
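+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A possible workaround for the slow criterion check, as a sketch (the helper `fit_with_sparse_checks` is hypothetical, not part of the assignment): run the same SGD loop as `NeuralNetwork.fit`, but evaluate the full-dataset error only every `check_every` updates."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Hypothetical variant of NeuralNetwork.fit: identical SGD updates, but\n",
+    "# the full-dataset error is evaluated only every check_every steps.\n",
+    "def fit_with_sparse_checks(nn, X, y, learning_rate=1, epochs=10000,\n",
+    "                           error_criterion=0.01, check_every=100):\n",
+    "    for k in range(epochs):\n",
+    "        if k % check_every == 0:\n",
+    "            nn.error = nn.mean_square_error(X, y)\n",
+    "            if nn.error < error_criterion:\n",
+    "                break\n",
+    "        i = np.random.randint(X.shape[0])\n",
+    "        a = [X[i]]\n",
+    "        for l in range(len(nn.weights)):\n",
+    "            a.append(nn.activation(np.dot(np.hstack((1, a[l])), nn.weights[l])))\n",
+    "        deltas = [(y[i] - a[-1]) * nn.activation_prime(a[-1])]\n",
+    "        for l in range(len(a) - 2, 0, -1):\n",
+    "            deltas.append(np.dot(nn.weights[l][1:], deltas[-1]) * nn.activation_prime(a[l]))\n",
+    "        deltas.reverse()\n",
+    "        for l in range(len(nn.weights)):\n",
+    "            nn.weights[l] += learning_rate * np.outer(np.hstack((1, a[l])), deltas[l])\n",
+    "\n",
+    "# Usage sketch: fit_with_sparse_checks(testnn, training_data, train_fun(training_data))"
+   ]
+  },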
+  {
+   "cell_type": "code",
+   "execution_count": 152,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[array([[-0.50325027, -0.80837581, -0.71209436,  0.54072039,  0.15305065],\n",
+       "        [-1.76344064, -1.44920066, -0.89819535, -1.91194376, -1.52794614],\n",
+       "        [-2.01661244, -1.75081518, -1.8705454 , -0.86569561, -1.70634533]]),\n",
+       " array([[-0.05322174],\n",
+       "        [ 0.52131733],\n",
+       "        [ 0.6899994 ],\n",
+       "        [ 0.60008547],\n",
+       "        [ 0.0413903 ],\n",
+       "        [ 1.7299185 ]])]"
+      ]
+     },
+     "execution_count": 152,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testnn.weights"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here the mean squared error on the training samples is already below the error criterion."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.0057749077749055504"
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testnn.error"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 153,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 0.5693499 ],\n",
+       "       [ 0.60821122],\n",
+       "       [ 0.65583173],\n",
+       "       ..., \n",
+       "       [ 0.57624142],\n",
+       "       [ 0.59523835],\n",
+       "       [ 0.57949126]])"
+      ]
+     },
+     "execution_count": 153,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "testnn.predict(training_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 178,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "prediction = testnn.predict(testing_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 179,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Flatten the (n, 1) prediction so the subtraction stays elementwise.\n",
+    "tmp = prediction.ravel() - train_fun(testing_data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The prediction mean squared error on the held-out samples is far larger than on the training samples, which indicates overfitting. This is also related to the fact that the loss function contains no regularization term; a weight-decay sketch follows the error computation below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 181,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.030990427107420407"
+      ]
+     },
+     "execution_count": 181,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "(tmp**2).mean()"
+   ]
+  },
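+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal remedy for the missing regularization term, as a sketch: add L2 weight decay, i.e. minimize $J(W,b) + \\frac{\\lambda}{2}\\sum_l \\|W^{(l)}\\|^2$, which only contributes an extra $-\\lambda W$ term to each gradient step. The helper below (`sgd_step_with_weight_decay`) is hypothetical and not part of the original assignment; the bias row is conventionally left undecayed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Hypothetical single SGD step for the weight-decay loss\n",
+    "# J(W,b; x, y) + (lam/2) * sum ||W||^2 (bias row not decayed).\n",
+    "def sgd_step_with_weight_decay(nn, x_i, y_i, learning_rate=1.0, lam=1e-4):\n",
+    "    a = [x_i]\n",
+    "    for l in range(len(nn.weights)):\n",
+    "        a.append(nn.activation(np.dot(np.hstack((1, a[l])), nn.weights[l])))\n",
+    "    deltas = [(y_i - a[-1]) * nn.activation_prime(a[-1])]\n",
+    "    for l in range(len(a) - 2, 0, -1):\n",
+    "        deltas.append(np.dot(nn.weights[l][1:], deltas[-1]) * nn.activation_prime(a[l]))\n",
+    "    deltas.reverse()\n",
+    "    for l in range(len(nn.weights)):\n",
+    "        step = np.outer(np.hstack((1, a[l])), deltas[l])\n",
+    "        decay = nn.weights[l].copy()\n",
+    "        decay[0] = 0  # do not decay the bias row\n",
+    "        nn.weights[l] += learning_rate * (step - lam * decay)"
+   ]
+  },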
"predition = testnn.predict(testing_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 179, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tmp = predition - train_fun(testing_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "对样本的预测mean squared error远远大于训练样本,显示由过拟合的现象。这也与损失函数中并没有添加正则项有关。" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.030990427107420407" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp**2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def mean_squared_error()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# 对不同的m值分别测试,计算predicting error并记录。\n", + "error =[]\n", + "for m in range(3,10):\n", + " nn = NeuralNetwork[[2,m,1]]\n", + " nn.fit(training_data,y)\n", + " error.append(nn.mean_sqaured_error(testing_data,train_fun(testing_data)))\n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}