16 機械学習をためそう説明

------------- 16-2.txt ------------------------------------ P387

●手書き数字データセットを読み込む


# Load the handwritten-digits dataset; later snippets read it through `digits`.
from sklearn import datasets
digits = datasets.load_digits()

●データセットに納められているものを調べる


xxxxxxxxxx
# List the attributes available on the loaded dataset object.
dir(digits)

P388

●digitsデータセットの説明文を読む


xxxxxxxxxx
# Print the description text that ships with the dataset.
print(digits.DESCR)

●データの構造を調べる


xxxxxxxxxx
# Shape of the image-data array.
digits.data.shape


xxxxxxxxxx
# Shape of the target (correct answer) array.
digits.target.shape

P389

●数字の画像データを確かめる


xxxxxxxxxx
# Image data for the digits.
digits.data

●正解の数値が入ったターゲットデータを確かめる


xxxxxxxxxx
# Target data holding the correct digit for each sample.
digits.target

●1文字目の画像データ


xxxxxxxxxx
# Image data of the first digit.
digits.data[0]

P390

●1文字目の画像データを8行8列で見てみる


xxxxxxxxxx
digits.images[0]     # image data of the first digit, viewed as 8 rows x 8 columns

●画像データから手書き文字を復元する


xxxxxxxxxx
# Render the first digit's image data as a greyscale picture.
import matplotlib.pyplot as plt
plt.matshow(digits.images[0], cmap="Greys")
plt.show()

●ターゲットデータで正解を調べてみる


xxxxxxxxxx
# Correct answer for the first digit.
digits.target[0]

P391

●訓練データとテストデータを用意する


x
# Split the dataset by position: the first two thirds of the samples are used
# for training and the remaining third for testing.
n_train = len(digits.data) * 2 // 3
X_train, X_test = digits.data[:n_train], digits.data[n_train:]
y_train, y_test = digits.target[:n_train], digits.target[n_train:]

P392

●構造を確認する


x
# Print the shapes of the four arrays produced by the split.
print([d.shape for d in [X_train, y_train, X_test, y_test]])

●学習器SVMで学習を行う


x
from sklearn import svm       # import the svm module
clf = svm.SVC(gamma=0.001)    # the learner (classifier)
clf.fit(X_train, y_train)     # learn from the training data and its labels

●テストデータで正答率を調べる


x
# Accuracy of the trained model on the test data.
print(clf.score(X_test, y_test))

P393

●学習済みモデルが誤って分類した個数を調べる


x
# Classify the test data, then count the samples whose prediction differs
# from the correct answer.
predicted = clf.predict(X_test)
(y_test != predicted).sum()

●学習結果の評価レポート


x
# Print an evaluation report comparing the correct answers with the predictions.
from sklearn import metrics
print(metrics.classification_report(y_test, predicted))

P394

●数字ごとに正解数と読み間違えた数字を調べる


x
# Per-digit view of how many were classified correctly and which digits
# the others were mistaken for.
print(metrics.confusion_matrix(y_test, predicted))

●画像イメージと分類結果（404～415の12文字を表示）


x
import matplotlib.pyplot as plt
# Pair each test image with its correct label and the model's prediction.
imgs_yt_preds = list(zip(digits.images[n_train:], y_test, predicted))
# Show test samples 404-415 (12 digits) on a 3x4 grid.
for index, (image, y_t, pred) in enumerate(imgs_yt_preds[404:416]):
    plt.subplot(3, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap="Greys", interpolation="nearest")
    plt.title(f'{y_t} pre:{pred}', fontsize=12)   # correct label and prediction
# Adjust the spacing once, after every subplot and its title exists.
plt.tight_layout()
# The call parentheses are required: a bare `plt.show` only names the function.
plt.show()

p395 完成したコード

●　手書き数字データを機械学習で分類する


xxxxxxxxxx
from sklearn import datasets
from sklearn import svm, metrics
import matplotlib.pyplot as plt
# Load the handwritten-digits dataset.
digits = datasets.load_digits()
X = digits.data    # handwritten digit data
y = digits.target   # targets (correct digits)
n_train = len(X)*2//3 # two thirds of the samples
# Training data: the first 2/3.
X_train, y_train = X[:n_train], y[:n_train]
# Test data: the last 1/3.
X_test, y_test = X[n_train:], y[n_train:]
# Create the learner and train it on the training data and its labels.
clf = svm.SVC(gamma=0.001)
clf.fit(X_train, y_train)
# Evaluate the trained model on the test data.
accuracy = clf.score(X_test, y_test)
print(f"正答率{accuracy}")
predicted = clf.predict(X_test)    # classification results for the test data
n_error = (y_test != predicted).sum()    # compare correct answers with predictions
print(f"誤った個数：{n_error}")
# Detailed reports.
print("classification report")
print(metrics.classification_report(y_test, predicted))
print("confusion matrix")
print(metrics.confusion_matrix(y_test, predicted))
# Images with their classification results (test samples 404-415, 12 digits).
imgs_yt_preds = list(zip(digits.images[n_train:], y_test, predicted))
for index, (image, y_t, pred) in enumerate(imgs_yt_preds[404:416]):
    plt.subplot(3, 4, index + 1)    # 3x4 grid
    plt.axis('off')
    plt.imshow(image, cmap="Greys", interpolation="nearest")
    plt.title(f't:{y_t}  pre:{pred}',  fontsize=12)    # correct answer and prediction
# Adjust the spacing once, after every subplot and its title exists,
# instead of re-running the layout on each iteration.
plt.tight_layout()
plt.show()

P396

［MEMO］SVCのパラメータを確認する


x
# Create an SVC and evaluate it as the last expression so the notebook
# displays the estimator.
from sklearn import svm
clf = svm.SVC(gamma=0.001)
clf

--------------- 16-3.txt ---------------------------------

P397

●アヤメのデータセットを読み込む


x
# Load the iris dataset; later snippets read it through `iris`.
from sklearn import datasets
iris = datasets.load_iris()

●irisデータセットに納められているものを調べる


x
# List the attributes available on the loaded iris dataset object.
dir(iris)

●irisデータセットの説明文を読む


print(iris.DESCR)

P398

●データの構造を調べる


xxxxxxxxxx
X = iris.data      # measurement data
y = iris.target    # target data
X.shape


x
# Shape of the target array.
y.shape

●訓練とテストに使う計測データ

x
# Measurement data used for both training and testing.
X

P399

●学習データの属性


x
# Names of the attributes (columns) in the measurement data.
iris.feature_names

●教師データ（ターゲット）


x
# Print the target data used as the training labels.
print(y)

●がく片の長さと幅の値で3種類のアヤメをプロットする


x
import matplotlib.pyplot as plt
# The samples are stored in species order:
# setosa 0-49, versicolor 50-99, virginica 100-149.
species_rows = [
    (slice(None, 50), 'r', 'o', 'setosa'),
    (slice(50, 100), 'g', '+', 'versicolor'),
    (slice(100, None), 'b', 'x', 'virginica'),
]
for rows, colour, symbol, name in species_rows:
    # Column 0 is plotted on the x axis (sepal length), column 1 on the y axis (sepal width).
    plt.scatter(X[rows, 0], X[rows, 1], color=colour, marker=symbol, label=name)
plt.title("Iris Plants Database")
plt.xlabel('sepal length(cm)')
plt.ylabel('sepal width(cm)')
plt.legend()
plt.show()

P400

●3種類のアヤメをプロットするコード


x
# One scatter series per class: class id, colour, marker and label are zipped together.
series_styles = zip(range(3), 'rgb', 'o+x', iris.target_names)
for class_id, colour, symbol, name in series_styles:
    members = X[y == class_id]    # rows whose target equals this class id
    plt.scatter(members[:, 0], members[:, 1], color=colour, marker=symbol, label=name)

●モデルclfを作って訓練データで学習する


xxxxxxxxxx
from sklearn import datasets
from sklearn import svm
# Load the iris dataset and separate measurements from targets.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Split by position: the first half is used for training, the second half for testing.
n_train = len(X) // 2
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]
# Build the model and train it.
clf = svm.SVC()
clf.fit(X_train, y_train)

●テストデータで評価する


x
# Accuracy of the trained model on the test data.
print(clf.score(X_test, y_test))

●データの前半分を訓練データにする


xxxxxxxxxx
n_train = len(X)//2    # half the number of samples
X_train, X_test = X[:n_train], X[n_train:] # first half -> training data, second half -> test data

P402

●分割比率を設定したShuffleSplitクラスのインスタンスを作る


x
# Create a ShuffleSplit configured for a 60% / 40% train/test ratio;
# random_state is fixed at 0.
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(train_size=0.6, test_size=0.4, random_state=0)

●訓練データとテストデータのインデックスを作り分割する


# Take the first pair of index arrays produced by the splitter and use them
# to select the training and test rows.
train_index, test_index = next(ss.split(X))
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

●変数の学習器を作って訓練する


x
clf = svm.SVC()    # create the learner
clf.fit(X_train, y_train)    # train it
print(clf.score(X_test, y_test))    # check the accuracy

P403

完成したコード

●　ShuffleSplitを使って学習データを分割する


x
from sklearn import datasets, svm
from sklearn.model_selection import ShuffleSplit
# Load the iris dataset.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Build the index arrays for a 60% / 40% split.
iris_ss = ShuffleSplit(train_size=0.6, test_size=0.4, random_state=0)
train_index, test_index = next(iris_ss.split(X))
# Select the training and test rows with those indices.
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
# Build the model, train it, and report its accuracy on the test data.
clf = svm.SVC()
clf.fit(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

●学習器をLogisticRegressionに取り替えて試してみる


x
# Swap the learner for LogisticRegression and repeat the same fit/score steps.
from sklearn import linear_model
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

---------------------- 16.4.txt --------------------- P404

●ボストンの住宅価格のデータセットを読み込む


x
# Load the Boston house-price dataset; later snippets read it through `boston`.
# NOTE(review): load_boston is understood to have been deprecated in
# scikit-learn 1.0 and removed in 1.2 -- confirm the installed version supports it.
from sklearn import datasets
boston = datasets.load_boston()

●データセットに納められているものを調べる


x
# List the attributes available on the loaded dataset object.
dir(boston)

●bostonデータセットの説明文を読む


x
# Print the description text that ships with the dataset.
print(boston.DESCR)

P406

●データセットをDataFrame型に変換し出力する


x
from pandas import DataFrame
# Build a DataFrame from the feature matrix, naming the columns after the features.
boston_df = DataFrame(boston.data, columns=boston.feature_names)
# Append the house price as an extra column.
boston_df["Price"] = boston.target
# Show only the first five rows.
print(boston_df.head(5))

P407

●回帰モデルを作って部屋数と価格の訓練データで訓練する


x
# linear_model is not imported anywhere earlier in this section, so import it
# here to keep the snippet runnable on its own.
from sklearn import linear_model
rooms_train = DataFrame(boston_df["RM"])    # extract the number-of-rooms column
y_train = boston.target    # target (house prices)
model = linear_model.LinearRegression()    # create the regression model
model.fit(rooms_train, y_train)    # train it

●予想価格をモデルを使って計算する


xxxxxxxxxx
# Build test data for the number of rooms: evenly spaced values in steps of 0.1
# from the smallest training value up to (not including) the largest.
import numpy as np
rm_values = rooms_train.values
rooms_test = DataFrame(np.arange(rm_values.min(), rm_values.max(), 0.1))
# Use the trained model to predict a house price for each test value.
prices_test = model.predict(rooms_test)

●実際のデータと回帰直線をグラフ表示する


xxxxxxxxxx
# pyplot is not imported anywhere earlier in this section, so import it here
# to keep the snippet runnable on its own.
import matplotlib.pyplot as plt
# Flatten the single-column DataFrames to 1-D arrays before plotting,
# matching the finished script for this section.
plt.scatter(rooms_train.values.ravel(), y_train, c= "b", alpha = 0.5)    # training data
plt.plot(rooms_test.values.ravel(), prices_test, c = "r", marker ="x")    # regression line
plt.title("Boston House Prices dataset")
plt.xlabel("rooms")    # x-axis label
plt.ylabel("price $1000's")    # y-axis label
plt.show()

p408

完成したコード

　●　部屋数から住宅価格を予想する回帰分析を行う


x
from sklearn import datasets, linear_model
import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
# Load the dataset (Boston house prices and related data).
# NOTE(review): load_boston is understood to have been removed in
# scikit-learn 1.2 -- confirm the installed version supports it.
boston = datasets.load_boston()
# Convert to a DataFrame with the feature names as column names,
# then append the house price as an extra column.
boston_df = DataFrame(boston.data, columns=boston.feature_names)
boston_df["Price"] = boston.target
print(boston_df.head(10))    # first 10 rows only
# Training data: number of rooms as the only feature, house price as the target.
rooms_train = DataFrame(boston_df["RM"])
y_train = boston.target
# Create the regression model and train it.
model = linear_model.LinearRegression()
model.fit(rooms_train, y_train)
# Test data: room counts in steps of 0.1 across the range seen in training.
rm_values = rooms_train.values
rooms_test = DataFrame(np.arange(rm_values.min(), rm_values.max(), 0.1))
prices_test = model.predict(rooms_test)    # predicted house prices
# Plot the training data and the regression line (rooms vs. price).
plt.scatter(rm_values.ravel(), y_train, c="b", alpha=0.5)
plt.plot(rooms_test.values.ravel(), prices_test, c="r")
plt.title("Boston House Prices dataset")
plt.xlabel("rooms")
plt.ylabel("price $1000's")
plt.show()

　　p409

●　seabornモジュールを使って散布図と回帰直線を引く


xxxxxxxxxx
from sklearn import datasets
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset.
# NOTE(review): load_boston is understood to have been removed in
# scikit-learn 1.2 -- confirm the installed version supports it.
boston = datasets.load_boston()
# Build the DataFrame with feature names as columns and append the house price.
boston_df = DataFrame(boston.data, columns=boston.feature_names)
boston_df["Price"] = boston.target
# Draw the scatter plot and regression line of price against number of rooms.
sns.set_style('whitegrid')
sns.lmplot(data=boston_df, x="RM", y="Price")
plt.show()