1200字范文,内容丰富有趣,写作的好帮手!
1200字范文 > 机器学习基础算法25-SVM实践

机器学习基础算法25-SVM实践

时间:2024-02-04 23:07:27

相关推荐

机器学习基础算法25-SVM实践

文章目录

1. 鸢尾花SVM-二特征分类
2. SVM多分类方法:One/One or One/Other
3. SVM不同参数的分类-不同的分类器(调参)
4. 不平衡数据的处理
5. 分类器指标
6. SVM用于手写图片识别
7. MNIST数字图片识别
8. SVR预测
9. SVR调参
10. SVM的RBF核与过拟合

1.鸢尾花SVM-二特征分类

# Iris SVM demo: classify the iris data set using only two features
# (sepal length and sepal width) and plot the resulting decision regions.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# SVM implementation from scikit-learn
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Axis labels for 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'

if __name__ == "__main__":
    path = 'iris.data'  # path of the data file
    data = pd.read_csv(path, header=None)
    # Columns 0-3 hold the features, column 4 holds the target label.
    x, y = data[list(range(4))], data[4]
    # Convert the string labels to categorical codes (mapped to 0, 1, 2).
    y = pd.Categorical(y).codes
    # Keep only sepal length and sepal width.
    x = x[[0, 1]]
    # Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, random_state=1, train_size=0.6)

    # Classifier.
    # decision_function_shape='ovr' builds the three-class decision out of
    # several binary (one-vs-rest) classifiers.
    clf = svm.SVC(C=0.1, kernel='linear', decision_function_shape='ovr')
    # clf = svm.SVC(C=0.8, kernel='rbf', gamma=20, decision_function_shape='ovr')
    clf.fit(x_train, y_train.ravel())

    # Accuracy, reported both via clf.score and via accuracy_score.
    print(clf.score(x_train, y_train))
    print('训练集准确率:', accuracy_score(y_train, clf.predict(x_train)))
    print(clf.score(x_test, y_test))
    print('测试集准确率:', accuracy_score(y_test, clf.predict(x_test)))

    # decision_function gives one value per class; the class with the
    # largest value is the predicted one, so its rows line up one-to-one
    # with the output of predict.
    print('decision_function:\n', clf.decision_function(x_train))
    print('\npredict:\n', clf.predict(x_train))

    # Plot: evaluate the classifier on a dense grid covering the data range.
    x1_min, x2_min = x.min()
    x1_max, x2_max = x.max()
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # grid sample points
    grid_test = np.stack((x1.flat, x2.flat), axis=1)  # points to classify
    grid_hat = clf.predict(grid_test)  # predicted class per grid point
    grid_hat = grid_hat.reshape(x1.shape)  # back to the grid's shape

    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    # All samples.
    plt.scatter(x[0], x[1], c=y, edgecolors='k', s=50, cmap=cm_dark)
    # Circle the test-set samples.
    plt.scatter(x_test[0], x_test[1], s=120, facecolors='none', zorder=10)
    plt.xlabel(iris_feature[0], fontsize=13)
    plt.ylabel(iris_feature[1], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'鸢尾花SVM二特征分类', fontsize=16)
    # The visibility flag is passed positionally: the old ``b=`` keyword was
    # removed from Matplotlib, while the positional form works everywhere.
    plt.grid(True, ls=':')
    plt.tight_layout(pad=1.5)
    plt.show()

0.8训练集准确率: 0.80.8测试集准确率: 0.8decision_function:[[ 2.23327436 0.86782059 -0.2086884 ][-0.18441377 2.18226918 1.01026605][-0.23484387 1.17885164 2.18363523][ 2.21053693 0.99621883 -0.21001569][ 2.23125191 0.9082 -0.21780963][-0.23548595 1.14446122 2.20715689][ 2.24598673 0.85731286 -0.22467694][-0.19655173 1.12265558 2.15361747][-0.22574542 1.10218037 2.20784054][-0.24414236 1.13665193 2.2237747 ][-0.14561483 2.13979862 1.01688699][-0.23611971 1.09037408 2.2242905 ]...[-0.26802028 1.17342522 2.25039591][-0.1733947 2.1703469 1.0125036 ][-0.23014198 1.16636152 2.18411744][-0.2225925 1.11935467 2.19740337][-0.24306957 1.19895557 2.1826614 ][-0.24414236 1.13665193 2.2237747 ][ 2.20641253 0.96238077 -0.19995171]]predict:[0 1 2 0 0 2 0 2 2 2 1 2 2 0 1 2 2 1 2 1 0 0 0 2 0 1 2 2 0 0 1 0 2 2 2 2 12 2 1 0 1 0 1 2 0 2 0 0 2 1 2 0 0 2 0 1 0 2 0 0 2 0 1 0 1 1 0 0 1 0 2 2 02 1 1 2 1 0 0 1 2 2 1 2 2 2 2 0]

2.SVM多分类方法:One/One or One/Other

# Multi-class SVM demo on four synthetic Gaussian clusters.
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn import svm
from sklearn.metrics import accuracy_score


def extend(a, b, r):
    """Scale the interval [a, b] about its midpoint by the factor ``r``.

    Args:
        a: Lower end of the interval.
        b: Upper end of the interval.
        r: Scale factor; ``r > 1`` widens the interval.

    Returns:
        A ``(low, high)`` tuple. The original computed the span as
        ``a - b`` and therefore returned the ends swapped, which reversed
        the plot axes and the sampling grid.
    """
    half_span = r * (b - a) / 2
    mid = (a + b) / 2
    return mid - half_span, mid + half_span


if __name__ == "__main__":
    # Build the data set.
    np.random.seed(0)
    N = 20
    x = np.empty((4 * N, 2))
    # Cluster centres.
    means = [(-1, 1), (1, 1), (1, -1), (-1, -1)]
    # Covariances; the last one, ((2, 1), (1, 2)), is a rotated ellipse.
    sigmas = [np.eye(2), 2 * np.eye(2), np.diag((1, 2)), np.array(((2, 1), (1, 2)))]
    for i, (mean, sigma) in enumerate(zip(means, sigmas)):
        # Distribution for this cluster.
        mn = stats.multivariate_normal(mean, sigma * 0.3)
        # Draw N samples from it.
        x[i * N:(i + 1) * N, :] = mn.rvs(N)
    # Column vector of the four labels ...
    a = np.array((0, 1, 2, 3)).reshape((-1, 1))
    # ... repeated N times per row and flattened: N zeros, N ones, ...
    y = np.tile(a, N).flatten()
    print('x=\n', x)
    print('y=\n', y)

    # Classifier.
    # decision_function_shape='ovo' trains one classifier per pair of classes.
    clf = svm.SVC(C=1, kernel='rbf', gamma=1, decision_function_shape='ovo')
    # clf = svm.SVC(C=1, kernel='linear', decision_function_shape='ovr')
    clf.fit(x, y)
    y_hat = clf.predict(x)
    acc = accuracy_score(y, y_hat)
    np.set_printoptions(suppress=True)
    print(u'预测正确的样本个数:%d,正确率:%.2f%%' % (round(acc * 4 * N), 100 * acc))
    # With 'ovo' and four classes there are six pairwise classifiers.
    print('decision_function = \n', clf.decision_function(x))
    print('预测值为', y_hat)

    # Plot the decision regions on a grid slightly larger than the data range.
    x1_min, x2_min = np.min(x, axis=0)
    x1_max, x2_max = np.max(x, axis=0)
    x1_min, x1_max = extend(x1_min, x1_max, 1.05)
    x2_min, x2_max = extend(x2_min, x2_max, 1.05)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    x_test = np.stack((x1.flat, x2.flat), axis=1)
    y_test = clf.predict(x_test)
    y_test = y_test.reshape(x1.shape)
    cm_light = mpl.colors.ListedColormap(['#FF8080', '#A0FFA0', '#6060FF', '#F080F0'])
    cm_dark = mpl.colors.ListedColormap(['r', 'g', 'b', 'm'])
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_test, cmap=cm_light)
    plt.scatter(x[:, 0], x[:, 1], s=40, c=y, cmap=cm_dark, alpha=0.7)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    # Positional flag instead of the removed ``b=`` keyword.
    plt.grid(True)
    plt.tight_layout(pad=2.5)
    plt.title(u'SVM多分类方法:One/One or One/Other', fontsize=18)
    plt.show()

预测正确的样本个数:69,正确率:86.25%[[ 0.00041835 1.0003916 1.25158694 1.03912429 1.06272462 0.06194409][ 0.73619309 0.99992928 1.00035655 1.01254723 1.0001967 -0.0689392 ][-0.15399317 0.73429473 0.85195011 1.02091686 0.54511927 -0.30394945][ 1.00009974 1.13669234 1.42106961 0.90744317 0.60940832 -0.33678376]...[-0.04075745 -0.7817 -1.13175539 -0.16447164 -1.12270726 -1.13341774][ 0.01171865 0.02596605 -0.99995753 0.02779846 -1.00003287 -1.00026606][-0.570479 -0.40889005 -0.99989226 0.26580403 -0.73799241 -0.99955873]][0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 3 0 0 0 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 21 1 1 2 3 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 3 3 3 3 3 3 3 3 3 3 0 3 3 33 3 3 3 3 3]

3. SVM不同参数的分类-不同的分类器(调参)

# SVM classification with different parameters (kernel, C, gamma), drawn
# side by side for comparison.
import numpy as np
from sklearn import svm
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt


def show_accuracy(a, b):
    """Print the fraction of positions where ``a`` and ``b`` agree.

    Args:
        a: Array of predicted labels.
        b: Array of true labels, same number of elements as ``a``.

    The original computed the comparison and discarded it because its
    print statement was commented out; the report is restored here.
    """
    acc = a.ravel() == b.ravel()
    print('正确率:%.2f%%' % (100 * np.mean(acc)))


if __name__ == "__main__":
    # ``np.float`` was removed from NumPy; the builtin ``float`` is the
    # same dtype.
    data = np.loadtxt('bipartition.txt', dtype=float, delimiter='\t')
    # First two columns are the features, the remainder is the label.
    x, y = np.split(data, (2, ), axis=1)
    y = y.ravel()

    # Classifier settings: (kernel, C) or (kernel, C, gamma).
    clf_param = (('linear', 0.1), ('linear', 0.5), ('linear', 1), ('linear', 2),
                 ('rbf', 1, 0.1), ('rbf', 1, 1), ('rbf', 1, 10), ('rbf', 1, 100),
                 ('rbf', 5, 0.1), ('rbf', 5, 1), ('rbf', 5, 10), ('rbf', 5, 100))
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x1, x2 = np.mgrid[x1_min:x1_max:200j, x2_min:x2_max:200j]  # grid sample points
    grid_test = np.stack((x1.flat, x2.flat), axis=1)  # points to classify

    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FFA0A0'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r'])
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(14, 10), facecolor='w')
    for i, param in enumerate(clf_param):
        kernel, c = param[0], param[1]
        if kernel == 'rbf':
            gamma = param[2]
            # gamma goes through the constructor rather than being set as
            # an attribute afterwards.
            clf = svm.SVC(C=c, kernel=kernel, gamma=gamma)
            # Raw string: '\g' is not a valid escape sequence.
            title = r'高斯核,C=%.1f,$\gamma$ =%.1f' % (c, gamma)
        else:
            clf = svm.SVC(C=c, kernel=kernel)
            title = u'线性核,C=%.1f' % c
        clf.fit(x, y)
        y_hat = clf.predict(x)
        show_accuracy(y_hat, y)  # accuracy on the training data

        # Report the support vectors.
        print(title)
        print('支撑向量的数目:', clf.n_support_)
        print('支撑向量的系数:', clf.dual_coef_)
        print('支撑向量:', clf.support_)

        # Draw this configuration in its own subplot.
        plt.subplot(3, 4, i+1)
        grid_hat = clf.predict(grid_test)  # predicted class per grid point
        grid_hat = grid_hat.reshape(x1.shape)  # back to the grid's shape
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8)
        # Samples.
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=40, cmap=cm_dark)
        # Support vectors, circled.
        plt.scatter(x[clf.support_, 0], x[clf.support_, 1], edgecolors='k',
                    facecolors='none', s=100, marker='o')
        z = clf.decision_function(grid_test)
        print('clf.decision_function(x) = ', clf.decision_function(x))
        print('clf.predict(x) = ', clf.predict(x))
        z = z.reshape(x1.shape)
        # Contours of the decision function at -1, -0.5, 0, 0.5 and 1.
        plt.contour(x1, x2, z, colors=list('kbrbk'),
                    linestyles=['--', '--', '-', '--', '--'],
                    linewidths=[1, 0.5, 1.5, 0.5, 1], levels=[-1, -0.5, 0, 0.5, 1])
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.title(title, fontsize=14)
    plt.suptitle(u'SVM不同参数的分类', fontsize=20)
    # ``pad`` is keyword-only in current Matplotlib.
    plt.tight_layout(pad=1.4)
    plt.subplots_adjust(top=0.92)
    plt.savefig('1.png')
    plt.show()

4.不平衡数据的处理

# Handling imbalanced data: compare SVMs with different class weights on a
# data set of 990 negative and 10 positive samples.
import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.exceptions import UndefinedMetricWarning
import warnings

if __name__ == "__main__":
    warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
    np.random.seed(0)  # keep the generated data identical between runs

    # Build the data set.
    c1 = 990
    c2 = 10
    N = c1 + c2
    x_c1 = 3*np.random.randn(c1, 2)
    x_c2 = 0.5*np.random.randn(c2, 2) + (4, 4)
    x = np.vstack((x_c1, x_c2))
    y = np.ones(N)
    y[:c1] = -1

    # Marker sizes: the minority class is drawn larger.
    s = np.ones(N) * 30
    s[:c1] = 10

    # Classifiers.
    clfs = [svm.SVC(C=1, kernel='linear'),
            svm.SVC(C=1, kernel='linear', class_weight={-1: 1, 1: 50}),
            svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1: 1, 1: 2}),
            svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1: 1, 1: 10})]
    titles = 'Linear', 'Linear, Weight=50', 'RBF, Weight=2', 'RBF, Weight=10'

    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # grid sample points
    grid_test = np.stack((x1.flat, x2.flat), axis=1)  # points to classify

    cm_light = matplotlib.colors.ListedColormap(['#77E0A0', '#FF8080'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r'])
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 8), facecolor='w')
    for i, clf in enumerate(clfs):
        clf.fit(x, y)

        y_hat = clf.predict(x)
        print(i+1, '次:')
        print('accuracy:\t', accuracy_score(y, y_hat))
        print('precision:\t', precision_score(y, y_hat, pos_label=1))
        print('recall:\t', recall_score(y, y_hat, pos_label=1))
        print('F1-score:\t', f1_score(y, y_hat, pos_label=1))
        print()

        # Draw the decision regions for this classifier.
        plt.subplot(2, 2, i+1)
        grid_hat = clf.predict(grid_test)  # predicted class per grid point
        grid_hat = grid_hat.reshape(x1.shape)  # back to the grid's shape
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8)
        # Samples.
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=s, cmap=cm_dark)
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.title(titles[i])
        plt.grid()
    plt.suptitle(u'不平衡数据的处理', fontsize=18)
    # ``pad`` is keyword-only in current Matplotlib.
    plt.tight_layout(pad=1.5)
    plt.subplots_adjust(top=0.92)
    plt.show()

1 次:
accuracy: 0.99
precision: 0.0
recall: 0.0
F1-score: 0.0

2 次:
accuracy: 0.94
precision: 0.14285714285714285
recall: 1.0
F1-score: 0.25

3 次:
accuracy: 0.994
precision: 0.625
recall: 1.0
F1-score: 0.7692307692307693

4 次:
accuracy: 0.994
precision: 0.625
recall: 1.0
F1-score: 0.7692307692307693

5.分类器指标

# Classifier metrics demo: accuracy, precision, recall, F1 and F-beta on a
# tiny hand-written example.
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import precision_recall_fscore_support, classification_report


def main():
    """Print the common binary-classification metrics for six samples."""
    y_true = np.array([1, 1, 1, 1, 0, 0])
    y_hat = np.array([1, 0, 1, 1, 1, 1])
    print('Accuracy:\t', accuracy_score(y_true, y_hat))

    # Precision = tp / (tp + fp): of the samples labelled positive, the
    # share that really are positive. Best value 1, worst value 0.
    precision = precision_score(y_true, y_hat)
    print('Precision:\t', precision)

    # Recall = tp / (tp + fn): of the truly positive samples, the share
    # the classifier found. Best value 1, worst value 0.
    recall = recall_score(y_true, y_hat)
    print('Recall: \t', recall)

    # F1 (balanced F-score) weights precision and recall equally:
    #     F1 = 2 * (precision * recall) / (precision + recall)
    # It is printed twice: once from sklearn and once from the formula.
    print('f1 score: \t', f1_score(y_true, y_hat))
    manual_f1 = 2 * (precision * recall) / (precision + recall)
    print(manual_f1)

    # F-beta is the weighted harmonic mean of precision and recall.
    # beta < 1 favours precision, beta > 1 favours recall
    # (beta -> 0 considers only precision, beta -> inf only recall).
    print('F-beta:')
    betas = np.logspace(-3, 3, num=7, base=10)
    for beta in betas:
        score = fbeta_score(y_true, y_hat, beta=beta)
        print('\tbeta=%9.3f\tF-beta=%.5f' % (beta, score))
        # Equivalent closed form:
        # (1+beta**2)*precision*recall / (beta**2 * precision + recall)

    # Per-class summary, first as raw arrays and then as a text report.
    per_class = precision_recall_fscore_support(y_true, y_hat, beta=1)
    print(per_class)
    print(classification_report(y_true, y_hat))


if __name__ == "__main__":
    main()

Accuracy: 0.5
Precision: 0.6
Recall: 0.75
f1 score: 0.6666666666666665
0.6666666666666665
F-beta:
	beta=    0.001	F-beta=0.60000
	beta=    0.010	F-beta=0.60001
	beta=    0.100	F-beta=0.60119
	beta=    1.000	F-beta=0.66667
	beta=   10.000	F-beta=0.74815
	beta=  100.000	F-beta=0.74998
	beta= 1000.000	F-beta=0.75000
(array([0. , 0.6]), array([0. , 0.75]), array([0. , 0.66666667]), array([2, 4], dtype=int64))
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.60      0.75      0.67         4

    accuracy                           0.50         6
   macro avg       0.30      0.38      0.33         6
weighted avg       0.40      0.50      0.44         6

6.SVM用于手写图片识别

# Handwritten digit recognition with an RBF SVM on the 8x8 optdigits data.
import numpy as np
import pandas as pd
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from time import time


def show_accuracy(a, b, tip):
    """Print the percentage of positions where ``a`` and ``b`` agree.

    Args:
        a: Array of predicted labels.
        b: Array of true labels, same number of elements as ``a``.
        tip: Text printed in front of the accuracy figure.
    """
    acc = a.ravel() == b.ravel()
    print(tip + '正确率:%.2f%%' % (100*np.mean(acc)))


def save_image(im, i):
    """Save one digit image as ``HandWritten/<i>.png``.

    Args:
        im: 2-D array of pixel intensities. It is scaled by 15.9375 and
            inverted (255 - value) before being stored as 8-bit pixels.
            The array is no longer modified in place.
        i: Index used as the file name.
    """
    scaled = im * 15.9375
    inverted = 255 - scaled
    pixels = inverted.astype(np.uint8)
    # os.path.join gives '.\\HandWritten' on Windows, as before, and a
    # proper sub-directory on other platforms.
    output_path = os.path.join('.', 'HandWritten')
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    Image.fromarray(pixels).save(os.path.join(output_path, '%d.png' % i))


if __name__ == "__main__":
    print('Load Training File Start...')
    # ``np.float`` / ``np.int`` were removed from NumPy; the builtins are
    # the same dtypes.
    data = np.loadtxt('optdigits.tra', dtype=float, delimiter=',')
    # The last column is the label, everything before it is the 64 pixels.
    x, y = np.split(data, (-1, ), axis=1)
    images = x.reshape(-1, 8, 8)
    y = y.ravel().astype(int)

    print('Load Test Data Start...')
    data = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1, ), axis=1)
    print(y_test.shape)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    # Alternative: split a single file instead of using the separate test file.
    # x, x_test, y, y_test = train_test_split(x, y, test_size=0.4, random_state=1)
    # images = x.reshape(-1, 8, 8)
    # images_test = x_test.reshape(-1, 8, 8)

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(15, 9), facecolor='w')
    # First 16 training images in the top half of the figure ...
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title(u'训练图片: %i' % y[index])
    # ... and the first 16 test images below; these are also saved to disk.
    for index, image in enumerate(images_test[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        plt.title(u'测试图片: %i' % y_test[index])
    plt.tight_layout()
    plt.show()

    # Alternative: search the hyper-parameters with cross-validation.
    # params = {'C':np.logspace(0, 3, 7), 'gamma':np.logspace(-5, 0, 11)}
    # model = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=params, cv=3)
    model = svm.SVC(C=10, kernel='rbf', gamma=0.001)
    print('Start Learning...')
    t0 = time()
    model.fit(x, y)
    t1 = time()
    t = t1 - t0
    print('训练+CV耗时:%d分钟%.3f秒' % (int(t/60), t - 60*int(t/60)))
    # print('最优参数:\t', model.best_params_)
    print('Learning is OK...')
    print('训练集准确率:', accuracy_score(y, model.predict(x)))
    y_hat = model.predict(x_test)
    # Reuse the prediction instead of running predict a second time.
    print('测试集准确率:', accuracy_score(y_test, y_hat))
    print(y_hat)
    print(y_test)

    # Collect the misclassified test samples.
    wrong = y_test != y_hat
    err_images = images_test[wrong]
    err_y_hat = y_hat[wrong]
    err_y = y_test[wrong]
    print(err_y_hat)
    print(err_y)
    plt.figure(figsize=(10, 8), facecolor='w')
    # Show at most the first 12 misclassified images.
    for index, image in enumerate(err_images[:12]):
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title(u'错分为:%i,真实值:%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()

Load Training File Start...Load Test Data Start...(1797, 1)Load Data OK...Start Learning...训练+CV耗时:0分钟0.323秒Learning is OK...训练集准确率: 1.0测试集准确率: 0.9827490261547023[0 1 2 ... 8 9 8][0 1 2 ... 8 9 8][9 1 1 1 1 9 5 9 9 9 9 9 9 8 1 0 1 3 8 9 9 3 5 9 1 7 3 5 8 5 1][5 2 2 2 8 7 7 5 7 7 7 7 7 1 8 6 8 9 9 3 8 8 8 7 8 3 9 9 3 3 8]

7.MNIST数字图片识别

# MNIST digit recognition with either an SVM or a random forest.
import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import pandas as pd
import os
import csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from time import time
from pprint import pprint


def save_image(im, i):
    """Save one digit image, inverted, as ``HandWritten/<i>.png``.

    Args:
        im: 2-D array of pixel intensities; stored as ``255 - im`` cast to
            8-bit pixels.
        i: Index used as the file name.
    """
    inverted = 255 - im
    pixels = inverted.astype(np.uint8)
    # os.path.join gives '.\\HandWritten' on Windows, as before, and a
    # proper sub-directory on other platforms.
    output_path = os.path.join('.', 'HandWritten')
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    Image.fromarray(pixels).save(os.path.join(output_path, '%d.png' % i))


def save_result(model):
    """Predict the unlabeled test set and write it to ``Prediction.csv``.

    Reads the module-level ``data_test`` array that the ``__main__``
    section below loads, so it must be called after that has run.

    Args:
        model: Fitted estimator with a ``predict`` method.
    """
    data_test_hat = model.predict(data_test)
    # Python 3's csv module needs a text-mode file opened with newline='';
    # the original binary mode ('wb') raises TypeError on the first row.
    with open('Prediction.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ImageId', 'Label'])
        # NOTE(review): ids start at 0 here, while the commented-out
        # alternative below numbers them from 1 - confirm which is wanted.
        for i, d in enumerate(data_test_hat):
            writer.writerow([i, d])
        # writer.writerows(zip(np.arange(1, len(data_test_hat) + 1), data_test_hat))


if __name__ == "__main__":
    classifier_type = 'RF'

    print('载入训练数据...')
    t = time()
    # ``np.int`` was removed from NumPy; the builtin ``int`` is the same dtype.
    data = pd.read_csv(os.path.join('.', 'MNIST.train.csv'), header=0, dtype=int)
    print('载入完成,耗时%f秒' % (time() - t))
    y = data['label'].values
    # Everything after the first column is pixel data.
    x = data.values[:, 1:]
    print('图片个数:%d,图片像素数目:%d' % x.shape)
    images = x.reshape(-1, 28, 28)
    y = y.ravel()

    print('载入测试数据...')
    t = time()
    data_test = pd.read_csv(os.path.join('.', 'MNIST.test.csv'), header=0, dtype=int)
    data_test = data_test.values
    images_test_result = data_test.reshape(-1, 28, 28)
    print('载入完成,耗时%f秒' % (time() - t))

    # Hold out 20% of the labeled data for evaluation.
    np.random.seed(0)
    x, x_test, y, y_test = train_test_split(x, y, train_size=0.8, random_state=1)
    images = x.reshape(-1, 28, 28)
    images_test = x_test.reshape(-1, 28, 28)
    print(x.shape, x_test.shape)

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(15, 9), facecolor='w')
    # First 16 training images in the top half of the figure ...
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title(u'训练图片: %i' % y[index])
    # ... and the first 16 unlabeled test images below; these are also saved.
    for index, image in enumerate(images_test_result[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        plt.title(u'测试图片')
    plt.tight_layout()
    plt.show()

    if classifier_type == 'SVM':
        # Alternative: search the hyper-parameters with cross-validation.
        # params = {'C':np.logspace(1, 4, 4, base=10), 'gamma':np.logspace(-10, -2, 9, base=10)}
        # clf = svm.SVC(kernel='rbf')
        # model = GridSearchCV(clf, param_grid=params, cv=3)
        model = svm.SVC(C=1000, kernel='rbf', gamma=1e-10)
        print('SVM开始训练...')
        t = time()
        model.fit(x, y)
        t = time() - t
        print('SVM训练结束,耗时%d分钟%.3f秒' % (int(t/60), t - 60*int(t/60)))
        # print('最优分类器:', model.best_estimator_)
        # print('最优参数:\t', model.best_params_)
        # print('model.cv_results_ =')
        # pprint(model.cv_results_)

        t = time()
        y_hat = model.predict(x)
        t = time() - t
        print('SVM训练集准确率:%.3f%%,耗时%d分钟%.3f秒' % (accuracy_score(y, y_hat)*100, int(t/60), t - 60*int(t/60)))
        t = time()
        y_test_hat = model.predict(x_test)
        t = time() - t
        print('SVM测试集准确率:%.3f%%,耗时%d分钟%.3f秒' % (accuracy_score(y_test, y_test_hat)*100, int(t/60), t - 60*int(t/60)))
        save_result(model)
    elif classifier_type == 'RF':
        # ``min_impurity_split`` was removed from scikit-learn. The 1e-10
        # threshold used originally was effectively zero, so dropping it
        # leaves the default of splitting until nodes are pure.
        rfc = RandomForestClassifier(n_estimators=100, criterion='gini',
                                     min_samples_split=2,
                                     bootstrap=True, oob_score=True)
        print('随机森林开始训练...')
        t = time()
        rfc.fit(x, y)
        t = time() - t
        print('随机森林训练结束,耗时%d分钟%.3f秒' % (int(t/60), t - 60*int(t/60)))
        print('OOB准确率:%.3f%%' % (rfc.oob_score_*100))
        t = time()
        y_hat = rfc.predict(x)
        t = time() - t
        print('随机森林训练集准确率:%.3f%%,预测耗时:%d秒' % (accuracy_score(y, y_hat)*100, t))
        t = time()
        y_test_hat = rfc.predict(x_test)
        t = time() - t
        print('随机森林测试集准确率:%.3f%%,预测耗时:%d秒' % (accuracy_score(y_test, y_test_hat)*100, t))

    # Collect the misclassified held-out samples.
    err = (y_test != y_test_hat)
    err_images = images_test[err]
    err_y_hat = y_test_hat[err]
    err_y = y_test[err]
    print(err_y_hat)
    print(err_y)
    plt.figure(figsize=(10, 8), facecolor='w')
    # Show at most the first 12 misclassified images.
    for index, image in enumerate(err_images[:12]):
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title(u'错分为:%i,真实值:%i' % (err_y_hat[index], err_y[index]))
    plt.suptitle(u'数字图片手写体识别:分类器%s' % classifier_type, fontsize=18)
    plt.tight_layout(rect=(0, 0, 1, 0.95))
    plt.show()

载入训练数据...载入完成,耗时2.418155秒图片个数:42000,图片像素数目:784载入测试数据...载入完成,耗时1.616158秒(33600, 784) (8400, 784)随机森林开始训练...随机森林训练结束,耗时0分钟19.135秒OOB准确率:95.821%随机森林训练集准确率:100.000%,预测耗时:1秒随机森林测试集准确率:96.512%,预测耗时:0秒[2 6 0 7 8 7 0 7 5 0 2 2 9 3 9 8 2 8 7 0 4 9 3 9 2 8 9 7 4 4 1 5 8 7 5 3 24 0 1 8 7 3 2 6 5 8 4 9 2 7 8 5 2 4 9 9 6 8 2 5 3 9 2 5 1 4 6 2 8 8 0 8 22 1 9 8 4 9 3 3 2 8 7 6 8 9 7 3 5 3 1 2 3 9 9 3 9 8 4 4 7 8 3 3 7 3 4 4 09 9 1 7 4 9 5 2 8 8 3 5 5 8 5 1 3 6 2 7 7 3 6 4 3 4 5 0 7 4 9 5 1 4 3 3 58 7 9 2 0 8 3 3 2 6 5 9 9 9 8 1 3 7 1 5 5 3 4 1 2 9 5 2 8 3 1 4 4 3 2 8 34 4 3 9 2 5 7 1 7 6 8 0 5 9 5 6 5 0 8 8 7 0 6 4 8 7 9 4 3 2 4 4 2 8 6 3 19 2 9 6 2 8 9 5 8 4 4 0 0 2 2 6 9 7 9 4 5 0 6 2 3 6 5 9 9 9 2 5 2 9 9 8 85 4 2 3 9 3 7 9 0 5 0 9 5 3 2 4 3 9 8 8 3 9 5 2 7 9 2 8 5 8 5 4 5 8][4 5 6 4 2 2 5 2 9 7 7 3 7 9 4 3 7 5 3 5 6 4 5 3 7 2 4 4 6 8 4 8 9 8 3 5 30 5 8 2 3 8 9 0 6 9 8 5 3 3 2 9 1 9 8 5 5 3 7 3 5 4 3 6 8 2 4 3 1 3 3 5 34 8 8 2 7 4 9 5 4 9 3 2 3 8 2 8 3 5 8 3 9 7 3 2 7 9 2 8 9 4 9 1 0 9 6 9 94 4 7 9 9 4 8 7 3 5 5 9 3 6 9 8 5 4 7 5 3 5 5 9 9 9 3 2 5 8 4 6 8 9 5 9 03 3 2 7 8 4 9 5 1 4 6 3 4 8 9 5 1 9 7 8 3 1 9 8 3 7 3 7 3 8 3 9 9 1 3 3 29 7 5 7 0 3 2 9 9 8 6 6 8 4 3 4 3 9 2 0 9 5 5 7 9 2 3 9 9 1 9 7 1 9 8 9 84 7 7 0 1 7 7 3 9 3 9 4 9 3 3 2 4 9 4 8 1 5 5 3 8 4 1 7 7 7 3 8 3 7 5 3 03 7 7 2 7 8 4 4 2 9 3 4 8 1 8 9 5 7 5 3 1 7 8 3 9 8 3 2 6 5 6 8 8 3]

8.SVR预测

也可用于时间序列分析

# SVR prediction: fit RBF, linear and polynomial kernels to a noisy sine.
# SVR can also be used for time-series analysis (as can ARIMA).
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt


if __name__ == "__main__":
    # Build the data: 50 sorted points on [0, 6) with y = 2*sin(x) + noise.
    N = 50
    np.random.seed(0)
    x = np.sort(np.random.uniform(0, 6, N), axis=0)
    y = 2*np.sin(x) + 0.1*np.random.randn(N)
    x = x.reshape(-1, 1)
    print('x =\n', x)
    print('y =\n', y)

    # Gaussian (RBF) kernel.
    print('SVR - RBF')
    svr_rbf = svm.SVR(kernel='rbf', gamma=0.2, C=100)
    svr_rbf.fit(x, y)
    # Linear kernel.
    print('SVR - Linear')
    svr_linear = svm.SVR(kernel='linear', C=100)
    svr_linear.fit(x, y)
    # Polynomial kernel.
    print('SVR - Polynomial')
    svr_poly = svm.SVR(kernel='poly', degree=3, C=100)
    svr_poly.fit(x, y)
    print('Fit OK.')

    # Exercise: try changing the factor 1.1 to 1.5.
    x_test = np.linspace(x.min(), 1.1*x.max(), 100).reshape(-1, 1)
    y_rbf = svr_rbf.predict(x_test)
    y_linear = svr_linear.predict(x_test)
    y_poly = svr_poly.predict(x_test)

    plt.figure(figsize=(9, 8), facecolor='w')
    plt.plot(x_test, y_rbf, 'r-', linewidth=2, label='RBF Kernel')
    plt.plot(x_test, y_linear, 'g-', linewidth=2, label='Linear Kernel')
    plt.plot(x_test, y_poly, 'b-', linewidth=2, label='Polynomial Kernel')
    plt.plot(x, y, 'mo', markersize=6)
    # Mark the support vectors of the RBF model.
    plt.scatter(x[svr_rbf.support_], y[svr_rbf.support_], s=200, c='r',
                marker='*', label='RBF Support Vectors', zorder=10)
    plt.legend(loc='lower left')
    plt.title('SVR', fontsize=16)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.grid(True)
    # ``pad`` is keyword-only in current Matplotlib.
    plt.tight_layout(pad=2)
    plt.show()

x =[[0.1127388 ][0.12131038][0.36135283][0.42621635]...[5.55357983][5.66248847][5.6680135 ][5.78197656][5.87171005]]y =[ 0.05437325 0.43710367 0.65611482 0.78304981 0.87329469 1.380880421.23598022 1.49456731 1.81603293 2.03841677 1.84627139 1.547977961.63479377 1.53337832 1.22278185 1.15897721 0.92928812 0.950656380.7281 0.69233817 -0.06030957 -0.23617129 -0.23697659 -0.34160192-0.69007014 -0.48527812 -1.00538468 -1.00756566 -0.98948253 -1.05661601-1.17133143 -1.46283398 -1.47415531 -1.61280243 -1.7131299 -1.78692494-1.85631003 -1.98989791 -2.11462751 -1.90906396 -1.95199287 -2.14681169-1.77143442 -1.55815674 -1.48840245 -1.35114367 -1.27027958 -1.04875251-1.00128962 -0.67767925]SVR - RBFSVR - LinearSVR - PolynomialFit OK.

9.SVR调参

# SVR hyper-parameter tuning with a cross-validated grid search.
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV    # was grid_search in sklearn 0.17
import matplotlib.pyplot as plt


def main():
    """Tune C and gamma of an RBF SVR on a noisy sine and plot the fit."""
    N = 50
    np.random.seed(0)
    # Same draw order as before: the x samples first, then the noise.
    x = np.sort(np.random.uniform(0, 6, N), axis=0)
    y = 2*np.sin(x) + 0.1*np.random.randn(N)
    x = x.reshape(-1, 1)
    print('x =\n', x)
    print('y =\n', y)

    # Candidate values: 10 log-spaced points in [1e-2, 1e2] per parameter.
    c_can = np.logspace(-2, 2, 10)
    gamma_can = np.logspace(-2, 2, 10)
    search_space = {'C': c_can, 'gamma': gamma_can}
    svr = GridSearchCV(svm.SVR(kernel='rbf'), param_grid=search_space, cv=5)
    svr.fit(x, y)
    print('验证参数:\n', svr.best_params_)

    # Predict on an evenly spaced grid over the observed x range.
    x_test = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
    y_hat = svr.predict(x_test)

    # Indices of the support vectors of the best estimator.
    sp = svr.best_estimator_.support_
    plt.figure(facecolor='w')
    plt.scatter(x[sp], y[sp], s=120, c='r', marker='*',
                label='Support Vectors', zorder=3)
    plt.plot(x_test, y_hat, 'r-', linewidth=2, label='RBF Kernel')
    plt.plot(x, y, 'go', markersize=5)
    plt.legend(loc='upper right')
    plt.title('SVR', fontsize=16)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    main()

x =[[0.1127388 ][0.12131038][0.36135283][0.42621635][0.5227758 ]...[5.2289][5.350638 ][5.55357983][5.66248847][5.6680135 ][5.78197656][5.87171005]]y =[ 0.05437325 0.43710367 0.65611482 0.78304981 0.87329469 1.380880421.23598022 1.49456731 1.81603293 2.03841677 1.84627139 1.547977961.63479377 1.53337832 1.22278185 1.15897721 0.92928812 0.950656380.7281 0.69233817 -0.06030957 -0.23617129 -0.23697659 -0.34160192-0.69007014 -0.48527812 -1.00538468 -1.00756566 -0.98948253 -1.05661601-1.17133143 -1.46283398 -1.47415531 -1.61280243 -1.7131299 -1.78692494-1.85631003 -1.98989791 -2.11462751 -1.90906396 -1.95199287 -2.14681169-1.77143442 -1.55815674 -1.48840245 -1.35114367 -1.27027958 -1.04875251-1.00128962 -0.67767925]验证参数:{'C': 35.93813663804626, 'gamma': 0.5994842503189409}

10.SVM的RBF核与过拟合

# RBF-kernel SVM and overfitting: two interleaved point lattices that a
# high-gamma RBF kernel separates perfectly.
import numpy as np
from sklearn import svm
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt


def extend(a, b):
    """Widen the interval [a, b] by 1% of its length on each side.

    Args:
        a: Lower end of the interval.
        b: Upper end of the interval.

    Returns:
        A ``(low, high)`` tuple equal to
        ``(a - 0.01*(b - a), b + 0.01*(b - a))``.
    """
    big, small = 1.01, 0.01
    return big*a-small*b, big*b-small*a


if __name__ == "__main__":
    # A 6x6 lattice on [-5, 5]^2 for the first class ...
    t = np.linspace(-5, 5, 6)
    t1, t2 = np.meshgrid(t, t)
    pts_pos = np.stack((t1.ravel(), t2.ravel()), axis=1)
    N = len(pts_pos)
    # ... and the same lattice shifted by (1, 1) for the second class.
    pts_neg = pts_pos + (1, 1)
    x = np.concatenate((pts_pos, pts_neg))
    y = np.array([1]*N + [-1]*N)

    clf = svm.SVC(C=0.1, kernel='rbf', gamma=5)
    clf.fit(x, y)
    y_hat = clf.predict(x)
    print('准确率:%.1f%%' % (np.mean(y_hat == y) * 100))

    # Plot the decision regions.
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FFA0A0'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r'])
    x1_min, x1_max = extend(x[:, 0].min(), x[:, 0].max())  # range of column 0
    x2_min, x2_max = extend(x[:, 1].min(), x[:, 1].max())  # range of column 1
    x1, x2 = np.mgrid[x1_min:x1_max:300j, x2_min:x2_max:300j]  # grid sample points
    grid_test = np.stack((x1.flat, x2.flat), axis=1)  # points to classify
    grid_hat = clf.predict(grid_test)
    # reshape instead of assigning to ``.shape``; back to the grid's shape.
    grid_hat = grid_hat.reshape(x1.shape)
    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    plt.scatter(x[:, 0], x[:, 1], s=60, c=y, marker='o', cmap=cm_dark)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title(u'SVM的RBF核与过拟合', fontsize=18)
    # ``pad`` is keyword-only in current Matplotlib.
    plt.tight_layout(pad=0.2)
    plt.show()

准确率:100.0%

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。