在本练习中,您将实现正则化的线性回归和多项式回归,并使用它来研究具有不同偏差-方差属性的模型
在前半部分的练习中,你将实现正则化线性回归,以预测水库中的水位变化,从而预测大坝流出的水量。在下半部分中,您将通过一些调试学习算法的诊断,并检查偏差 v.s. 方差的影响。
可视化数据
数据集分为:
import numpy as np import matplotlib.pyplot as plt from scipy.io import loadmat import scipy.optimize as opt
''' 1.Prepare dataset ''' data = loadmat('ex5data1.mat') print(data) X, y = data['X'], data['y'] Xval, yval = data['Xval'], data['yval'] Xtest, ytest = data['Xtest'], data['ytest'] # Insert a column of 1's to all of the X's, as usual X = np.insert(X, 0, 1, axis=1) Xval = np.insert(Xval, 0, 1, axis=1) Xtest = np.insert(Xtest, 0, 1, axis=1) print(X.shape, y.shape) # (12, 2) (12, 1) print(Xval.shape, yval.shape) # (21, 2) (21, 1) print(Xtest.shape, ytest.shape) # (21, 2) (21, 1) # Visualizing the data 可视化数据 def plotData(X, y): '''可视化数据''' fig, ax = plt.subplots(figsize=(6, 4)) X = X[:, 1:] ax.scatter(X, y, c='r') ax.set_xlabel('Change in water level (x)') ax.set_ylabel('Water flowing out of the dam (y)') ax.grid(True) # plt.show() pass
J ( θ ) = 1 2 m ( ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) 2 ) + λ 2 m ( ∑ j = 1 n θ j 2 ) J(\theta)=\frac{1}{2 m}\left(\sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right)^{2}\right)+\frac{\lambda}{2 m}\left(\sum_{j=1}^{n} \theta_{j}^{2}\right) J(θ)=2m1(i=1∑m(hθ(x(i))−y(i))2)+2mλ(j=1∑nθj2)
def costReg(theta, X, y, lam): ''' do not regularizethe theta0 theta is a 1-d array with shape (n+1,) X is a matrix with shape (m, n+1) y is a matrix with shape (m, 1) :param theta:weights :param X:输入矩阵 :param y:输出向量 :param lam:lambda :return:costReg ''' cost = np.nansum((X @ theta - y.flatten()) ** 2) reg = lam * theta[1:] @ theta[1:] return (cost + reg) * (1 / (2 * len(X))) theta = np.ones(X.shape[1]) a = costReg(theta, X, y, 1) # 303.9931922202643
∂ J ( θ ) ∂ θ 0 = 1 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) x j ( i ) for j = 0 ∂ J ( θ ) ∂ θ j = ( 1 m ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) x j ( i ) ) + λ m θ j for j ≥ 1 \begin{array}{ll} \frac{\partial J(\theta)}{\partial \theta_{0}}=\frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right) x_{j}^{(i)} & \text { for } j=0 \\ \frac{\partial J(\theta)}{\partial \theta_{j}}=\left(\frac{1}{m} \sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right) x_{j}^{(i)}\right)+\frac{\lambda}{m} \theta_{j} & \text { for } j \geq 1 \end{array} ∂θ0∂J(θ)=m1∑i=1m(hθ(x(i))−y(i))xj(i)∂θj∂J(θ)=(m1∑i=1m(hθ(x(i))−y(i))xj(i))+mλθj for j=0 for j≥1
def gradientReg(theta, X, y, lam): ''' theta: 1-d array with shape (2,) X: 2-d array with shape (12, 2) y: 2-d array with shape (12, 1) l: lambda constant grad has same shape as theta (2,) :param theta:weights :param X:输入矩阵 :param y:输出向量 :param lam:lambda :return:costReg ''' grad = (X @ theta - y.flatten()) @ X reg = lam * theta reg[0] = 0 return (grad + reg) / (len(X)) b = gradientReg(theta, X, y, 1) # [-15.30301567 598.25074417]
def trainLinearReg(X, y, lam): theta = np.zeros(X.shape[1]) res = opt.minimize(fun=costReg, jac=gradientReg, method='TNC', args=(X, y, lam), x0=theta) return res.x fit_theta = trainLinearReg(X, y, 0) # plotData(X, y) # plt.plot(X[:, 1], X @ fit_theta) # plt.show()
这里我们把λ= 0,因为我们现在实现的线性回归只有两个参数,这么低的维度,正则化并没有用。
从图中可以看到,拟合最好的这条直线告诉我们这个模型并不适合这个数据。
在下一节中,您将实现一个函数来生成学习曲线,它可以帮助您调试学习算法,即使可视化数据不那么容易。
机器学习中一个重要的概念是偏差(bias)和方差(variance)的权衡。高偏差意味着欠拟合,高方差意味着过拟合。
在这部分练习中,您将在学习曲线上绘制训练误差和验证误差,以诊断bias-variance问题。
J t r a i n ( θ ) = 1 2 m [ ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) 2 ] J_{\mathrm{train}}(\theta)=\frac{1}{2 m}\left[\sum_{i=1}^{m}\left(h_{\theta}\left(x^{(i)}\right)-y^{(i)}\right)^{2}\right] Jtrain(θ)=2m1[i=1∑m(hθ(x(i))−y(i))2]
训练样本X从1开始逐渐增加,训练出不同的参数向量θ。接着通过交叉验证样本Xval计算验证误差。
使用训练集的子集来训练模型,得到不同的theta。
通过theta计算训练代价和交叉验证代价,切记此时不要使用正则化,将 λ = 0 \lambda = 0λ=0。
计算交叉验证代价时记得整个交叉验证集来计算,无需分为子集。
def plot_learning_curve(X, y, Xval, yval, lam): '''画出学习曲线,即交叉验证误差和训练需查随样本数量的变化的变化''' xx = range(1, len(X) + 1) training_cost, cv_cost = [], [] for i in xx: res = trainLinearReg(X[:i], y[:i], lam) training_cost_i = costReg(res, X[:i], y[:i], lam) cv_cost_i = costReg(res, Xval, yval, lam) training_cost.append(training_cost_i) cv_cost.append(cv_cost_i) pass plt.figure(figsize=(8, 5)) plt.plot(xx, training_cost, label='training cost') plt.plot(xx, cv_cost, label='cross validation cost') plt.legend() plt.xlabel('Number of Training samples') plt.ylabel('Error') plt.title('Learning curve for linear regression') plt.grid(True) plt.show() pass
# plot_learning_curve(X, y, Xval, yval, 0)
从图中看出来,随着样本数量的增加,训练误差和交叉验证误差都很高,这属于高偏差,欠拟合。
数据预处理
# 数据预处理 def genPolyFeatures(X, power): ''' 添加多项式特征 每次在array的最后一列插入第二列的i+2次方(第一列为偏置) 从二次方开始开始插入(因为本身含有一列一次方) :param X: :param power: :return: ''' Xpoly = X.copy() for i in range(2, power+1): Xpoly = np.insert(Xpoly, Xpoly.shape[1], np.power(Xpoly[:, 1], i), axis=1) pass return Xpoly
关于归一化,所有数据集应该都用训练集的均值和样本标准差处理。切记。所以要将训练集的均值和样本标准差存储起来,对后面的数据进行处理。
def get_mean_std(X): '''获取数据的均值和误差,用来标准化所有数据''' means = np.nanmean(X, axis=0) # 每一行的相加起来除以总数 stds = np.nanstd(X, axis=0, ddof=1) # ddof=1 means 样本标准差 return means, stds def featureNormalize(myX, means, stds): '''标准化''' X_norm = myX.copy() X_norm[:, 1:] = X_norm[:, 1:] - means[1:] # 减去相应的位置 X_norm[:, 1:] = X_norm[:, 1:] / stds[1:] # 除相应的位置 return X_norm
而且注意这里是样本标准差而不是总体标准差,使用np.std()时,将ddof=1则是样本标准差,默认=0是总体标准差。而pandas默认计算样本标准差。
获取添加多项式特征以及标准化之后的数据。
power = 6 T = genPolyFeatures(X, power) train_means, train_stds = get_mean_std(genPolyFeatures(X, power)) X_norm = featureNormalize(genPolyFeatures(X, power), train_means, train_stds) Xval_norm = featureNormalize(genPolyFeatures(Xval, power), train_means, train_stds) Xtest_norm = featureNormalize(genPolyFeatures(Xtest, power), train_means, train_stds)
def plot_fit(means, stds, lam, power): '''画出拟合曲线''' theta = trainLinearReg(X_norm, y, lam) x = np.linspace(-75, 55, 50) xmat = x.reshape(-1, 1) # 数组新的shape属性应该要与原来的配套,如果等于-1的话,那么Numpy会根据剩下的维度计算出数组的另外一个shape属性值。 xmat = np.insert(xmat, 0, 1, axis=1) Xmat = genPolyFeatures(xmat, power) Xmat_norm = featureNormalize(Xmat, means, stds) plotData(X, y) plt.plot(x, Xmat_norm @ theta, 'b--') plt.show() pass
# lambda = 0 plot_fit(train_means, train_stds, 0, 6) plot_learning_curve(X_norm, y, Xval_norm, yval, 0)
# lambda = 1 plot_fit(train_means, train_stds, 1, 6) plot_learning_curve(X_norm, y, Xval_norm, yval, 1)
# lambda = 100 plot_fit(train_means, train_stds, 100, 6) plot_learning_curve(X_norm, y, Xval_norm, yval, 100)
惩罚过多,欠拟合状态
lambdas = [0, 0.01, 0.02, 0.04, 0.08, 0.15, 0.32, 0.64, 1.28, 2.56, 3, 5.12, 10] errors_train, errors_val = [], [] for l in lambdas: theta = trainLinearReg(X_norm, y, l) errors_train.append(costReg(theta, X_norm, y, 0)) # 记得把lambda = 0 errors_val.append(costReg(theta, Xval_norm, yval, 0)) pass plt.figure(figsize=(8, 6)) plt.plot(lambdas, errors_train, c='b', label='train') plt.plot(lambdas, errors_val, c='r', label='cv') plt.legend() plt.xlabel('Learning parameters: λ') plt.ylabel('Error') plt.grid(True) # plt.show() # 可以看到时交叉验证代价最小的是 lambda = 2.56 var = lambdas[np.nanargmin(errors_val)] # 2.56 # print(var)
''' 6.Computing test set error ''' theta = trainLinearReg(X_norm, y, 2.56) print('test cost(l={}) = {}'.format(2.56, costReg(theta, Xtest_norm, ytest, 0))) # for l in lambdas: # theta = trainLinearReg(X_norm, y, l) # print('test cost(l={}) = {}'.format(l, costReg(theta, Xtest_norm, ytest, 0
参考链接:https://blog.csdn.net/Cowry5/article/details/80421712