【Python】機械学習で作成した学習モデルをjoblibで出力保存したり読み込んだりする

こんにちは、ミナピピン(@python_mllover)です。

今回はPythonのScikit-learnで学習したモデルを出力して別のファイルで読み込んで、そのモデルに数値を入力して予測を行う手順を紹介したいと思います。

ランダムフォレストで機械学習モデルを作成してエクスポートする

# https://qiita.com/merry1221/items/ae66a166b86fd1bd8acaより
import requests
import time
import datetime
import traceback
import pandas as pd
import talib as ta
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna
from sklearn.externals import joblib


# ------------価格取得関数（5分足）クリプトウォッチから取得------------
def get_price(after=0, before=0):
    """Fetch 5-minute BTC-FX/JPY candles for bitFlyer from Cryptowatch.

    The request is retried forever: after any failure the traceback is
    printed and the function sleeps 60 seconds before trying again.

    Args:
        after: Currently unused; kept so existing callers keep working.
        before: Currently unused; kept so existing callers keep working.

    Returns:
        A list of dicts, one per candle, with the keys ``close_time``
        (local time formatted as ``HH:MM:SS``), ``open_price``,
        ``high_price``, ``low_price`` and ``close_price``. The list is
        empty when the API reports no data for the 300-second period.
    """
    while True:
        try:
            # NOTE(review): the API key is embedded in the URL; consider
            # loading it from configuration instead of source code.
            # The timeout keeps a stalled connection from blocking forever;
            # a timeout is retried like any other failure below.
            response = requests.get(
                "https://api.cryptowat.ch/markets/bitflyer/btcfxjpy/ohlc?periods=300&apikey=FBUTE26E60TN7F2NFOGV",
                timeout=30).json()
            information = response["result"]["300"]
            if information is None:
                return []
            # Each candle is a sequence: [timestamp, open, high, low, close, ...]
            return [
                {"close_time": datetime.fromtimestamp(i[0]).strftime("%H:%M:%S"), "open_price": i[1],
                 "high_price": i[2], "low_price": i[3], "close_price": i[4]}
                for i in information
            ]

        except Exception:
            # Deliberate best-effort: report the error and retry after a pause.
            print("Cryptowatchの価格取得でエラー発生 : ", traceback.format_exc())
            print("60秒待機してやり直します")
            time.sleep(60)

# グループ分けの関数


def classify(x):
    """Map a percent change versus the previous bar to a group label.

    Returns:
        0 when ``x <= -0.2``, 1 when ``-0.2 < x < 0.2``, 2 when
        ``x >= 0.2``. Values that satisfy none of the comparisons
        (e.g. NaN) yield ``None``.
    """
    # Group 0: fell by 0.2% or more
    if x <= -0.2:
        return 0
    # Group 1: stayed strictly between -0.2% and 0.2%
    if x < 0.2:
        return 1
    # Group 2: rose by 0.2% or more
    if x >= 0.2:
        return 2
    # NaN fails every comparison above and falls through
    return None


def objective(trial):
    """Optuna objective for tuning the random forest.

    Samples hyperparameters from ``trial``, fits a RandomForestClassifier
    on the module-level ``X_train``/``Y_train`` and returns the error rate
    (1 - accuracy) on ``X_test``/``Y_test``.
    """
    # Sample in a fixed order: criterion, min_samples_split,
    # max_leaf_nodes, n_estimators, max_depth.
    split_criterion = trial.suggest_categorical(
        "criterion", ["gini", "entropy"])
    split_minimum = trial.suggest_int("min_samples_split", 2, 16)
    # Discrete-uniform suggestions come back as floats; cast to int.
    leaf_limit = int(
        trial.suggest_discrete_uniform("max_leaf_nodes", 4, 64, 4))
    tree_count = int(
        trial.suggest_discrete_uniform("n_estimators", 50, 500, 50))
    depth_limit = trial.suggest_int("max_depth", 3, 10)

    forest = RandomForestClassifier(
        random_state=1,
        n_estimators=tree_count,
        max_leaf_nodes=leaf_limit,
        max_depth=depth_limit,
        max_features=None,
        criterion=split_criterion,
        min_samples_split=split_minimum,
    )
    forest.fit(X_train, Y_train)

    predicted = forest.predict(X_test)
    return 1 - accuracy_score(Y_test, predicted)


# Fetch prices and shape them into a DataFrame
price_data = get_price()
df = pd.DataFrame(price_data)

# Compute technical indicators used as explanatory variables
df['ma5'] = ta.SMA(df["close_price"], timeperiod=5)
df["RSI"] = ta.RSI(df["close_price"], timeperiod=14)
df['macd'], df['macdsignal'], df['macdhist'] = ta.MACD(
    df["close_price"], fastperiod=12, slowperiod=26, signalperiod=9)
# Percent change versus the previous bar (multiplied by 100 to express it in %)
df["前足比"] = df["close_price"].pct_change()*100
df["前足比_classified"] = df["前足比"].apply(classify)

# Target = class of the NEXT bar. shift(-1) moves each following row's label
# up onto the current row, so the indicators of bar t are paired with the
# move of bar t+1. Without this the model would learn the current bar's own
# move instead of the move five minutes ahead.
df_y = df["前足比_classified"].shift(-1)
df["target"] = df_y
# Drop rows containing NaN: the indicator warm-up rows and the final row,
# which has no next bar to use as a target.
df_xy = df.dropna(how="any")

# Split into training and test data
X_train, X_test, Y_train, Y_test = \
    train_test_split(df_xy[["ma5", "RSI", "macd", "macdsignal"]],
                     # shift() introduces NaN, so cast the labels back to int
                     df_xy["target"].astype(int), train_size=0.8, random_state=0)

# Hyperparameter optimization (objective returns the error rate, minimized)
study = optuna.create_study()
study.optimize(objective, n_trials=100)

# best_value is the lowest error rate, so 1 - best_value is the best accuracy
print(1-study.best_value)
print(study.best_params)

# Selected hyperparameters
min_samples_split = study.best_params["min_samples_split"]
max_leaf_nodes = int(study.best_params["max_leaf_nodes"])
criterion = study.best_params["criterion"]
n_estimators = int(study.best_params["n_estimators"])
max_depth = study.best_params["max_depth"]

# Build the random forest with the selected hyperparameters
clf = RandomForestClassifier(random_state=1,
                             n_estimators=n_estimators,
                             max_leaf_nodes=max_leaf_nodes,
                             max_depth=max_depth,
                             max_features=None,
                             criterion=criterion,
                             min_samples_split=min_samples_split)
# Train the model
clf.fit(X_train, Y_train)
# Save the trained model to disk
joblib.dump(clf, filename='output.clf')

参考：https://qiita.com/merry1221/items/ae66a166b86fd1bd8aca

ta-libがインストールエラーになる場合は以下の記事を参考にしてください。

コードの中身はビットコインの5分おきの価格データを取得し、説明変数がテクニカル指標で目的変数が五分後の値動きとなっています。

学習モデルの保存はjoblib.dump()で行えます。引数のfilename='output.clf'でモデルのファイル名を指定できます。拡張子は何でも構いません。なお、scikit-learn 0.23以降ではsklearn.externals.joblibが削除されているため、その場合は「import joblib」に置き換えてください。

生成した機械学習モデルの読み込み

次は生成して出力した機械学習モデルを読み込みましょう。出力した機械学習モデルの読み込みはjoblib.load()で行います。モデルへの当てはめは予測と同じようにclf.predict()で行えます。

from sklearn.externals import joblib
import pandas as pd
import talib as ta
# Load the model saved by the training script
clf = joblib.load('output.clf')

# NOTE(review): `df` is not defined in this snippet; it is assumed to be a
# price DataFrame with a "close_price" column prepared beforehand — confirm.
close_prices = df["close_price"]

# Moving average
df['ma5'] = ta.SMA(close_prices, timeperiod=5)
# RSI
df["RSI"] = ta.RSI(close_prices, timeperiod=14)
# MACD (line, signal line, histogram)
macd_line, macd_signal, macd_hist = ta.MACD(
    close_prices, fastperiod=12, slowperiod=26, signalperiod=9)
df['macd'] = macd_line
df['macdsignal'] = macd_signal
df['macdhist'] = macd_hist

# Predict the group for the most recent row using the loaded model
latest_features = df[['ma5', 'RSI', 'macd', "macdsignal"]].iloc[-1:]
result = clf.predict(latest_features)

これでresultには、予測されたグループを表す0・1・2のいずれかが入った配列が返されます。

参考記事：https://algorithm.joho.info/machine-learning/python-scikit-learn-decision-tree-import/