The Effects of the Learning Rate on Model Performance

5 min readSep 20, 2020

ก่อนจะกล่าวถึง Learning Rate หากไม่พูดถึง Parameter หรือ Hyperparameter คงไม่ได้ โดยที่ Parameter คือตัวแปร (Variable) ที่อยู่ภายใน Model เช่น Weight, Bias ซึ่งจะถูกประมาณค่าโดยอัตโนมัติจาก Dataset ที่ใช้งาน ในขณะที่ Hyperparameter จะเป็นตัวแปรภายนอก Model เช่น Learning Rate, Dropout Rate, Batch Size ซึ่งไม่ได้เกิดจากการประมาณค่าโดย Dataset

Learning Rate จึงเป็น Hyperparameter ที่สำคัญตัวหนึ่ง ที่มีหน้าที่ในการปรับขนาดของ Error ในแต่ละครั้งของการปรับปรุง Weight และ Bias ด้วย Back-propagation Algorithm ดังสมการต่อไปนี้

Update w = w - Learning_Rate*Error_at_w

ซึ่งการปรับเปลี่ยน Learning Rate จะมีผลกระทบกับประสิทธิภาพของ Model เป็นอย่างมาก ถ้าให้เลือกว่าจะปรับจูน Hyperparameter ตัวไหนก่อน Learning Rate คงเป็น Hyperparameter ตัวแรกๆ ที่ควรจะพิจารณาครับ

Impact of Learning Rate

เราจะใช้ Learning Rate ควบคุมความเร็วในการปรับตัวของ Model ต่อปัญหาที่ต้องแก้ไข ซึ่งการกำหนด Learning Rate ขนาดเล็ก จะทำให้การ Train Model แต่ละรอบจะปรับปรุง Weight และ Bias ทีละนิด จึงต้องการจำนวน Epoch หลาย Epoch ขณะที่การกำหนด Learning Rate ขนาดใหญ่ จะทำให้ในการ Train แต่ละรอบจะมีการปรับปรุง Weight และ Bias อย่างรวดเร็ว จึงต้องการจำนวน Epoch ที่น้อยกว่า

ใน Keras Framework สามารถกำหนดค่า Learning Rate เริ่มต้น ผ่านทาง Stochastic Gradient Descent Algorithm แบบต่างๆ อย่างเช่น SGD, AdaGrad (Adaptive Gradient Algorithm), RMSprop (Root Mean Square Propagation) หรือ Adam (Adaptive Moment Estimation) ฯลฯ ซึ่งเราเรียก Algorithm เหล่านี้ว่า Optimizer

ตัวอย่างต่อไปนี้จะใช้ Dataset ที่ชื่อ mnist

Import Library

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D, Dropout, Dense
from matplotlib import pyplot
from numpy import where
import numpy as np
import pandas as pd
import plotly.express as px
from tensorflow.keras.datasets import mnist
import tensorflow as tf
import time
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras import backend

ทำการ Load Data และเตรียมนำข้อมูลเข้า DataFrame

# Download MNIST as (train, test) pairs of image arrays and integer labels.
train_split, test_split = mnist.load_data()
X_train, y_train = train_split
X_test, y_test = test_split

# Scale pixel values by 1/255, then collapse every image into a single row
# vector so each split becomes a 2-D (samples, pixels) array.
X_train = (X_train / 255.0).reshape(len(X_train), -1)
X_test = (X_test / 255.0).reshape(len(X_test), -1)

นำ Dataset มาแปลงเป็น DataFrame โดยเปลี่ยนชนิดข้อมูลใน Column class เป็น String

# Wrap the training inputs and their labels in DataFrames and join them
# column-wise, so every row holds one sample followed by its 'class' label.
X_train_pd = pd.DataFrame(X_train)
y_train_pd = pd.DataFrame(y_train, columns=['class'])
df = pd.concat([X_train_pd, y_train_pd], axis=1)

# Convert the label column from numbers to strings.
df['class'] = df['class'].astype(str)

เข้ารหัสผลเฉลย แบบ One-Hot Encoder เมื่อ Model มีการ Predict ว่าเป็น Class ไหน มันจะให้ค่าความมั่นใจ (Confidence) กลับมาด้วยทุกครั้ง

# One-hot encode the class labels of both splits so they match the
# softmax output / categorical cross-entropy loss used by the models.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

นิยาม Model, Compile Model และ Plot Accuracy

def fit_model(trainX, trainy, testX, testy, lrate):
    """Train a small dense classifier with plain SGD and plot its accuracy.

    Builds a Flatten -> Dense(50, relu) -> Dense(10, softmax) network,
    trains it for 50 epochs and draws the training and validation accuracy
    curves on the currently active pyplot axes.

    Args:
        trainX: Training inputs (array exposing a ``shape`` attribute).
        trainy: Training labels, as expected by categorical cross-entropy.
        testX: Validation inputs.
        testy: Validation labels.
        lrate: Learning rate handed to the SGD optimizer.
    """
    model = Sequential()
    # Take the per-sample shape from the data itself, so the model accepts
    # both 28x28 images and already-flattened rows instead of hard-coding
    # a shape that may not match what is actually fed in.
    model.add(Flatten(input_shape=trainX.shape[1:]))
    # No input_dim here: this is not the first layer, so its input size is
    # inferred from the layer above.
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))

    # Use the `learning_rate` keyword; the short `lr` spelling is deprecated.
    opt = SGD(learning_rate=lrate)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=50, verbose=1)

    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    pyplot.title('lr='+str(lrate), pad=-35)


# Learning rates to compare, one training run per value.
learning_rates = [1E-0, 1E-1, 1E-2, 1E-3, 1E-4, 1E-5, 1E-6, 1E-7]
# Run one training per learning rate, drawing each run in its own cell of
# a 4-row by 2-column subplot grid.
total_runs = len(learning_rates)
for run_idx, rate in enumerate(learning_rates):
    print(f"round: {run_idx} of {total_runs}")
    pyplot.subplot(4, 2, run_idx + 1)
    fit_model(X_train, y_train, X_test, y_test, rate)
pyplot.tight_layout()
pyplot.show()

เมื่อ Train เสร็จแล้ว จะเห็นกราฟ Accuracy หรือ Learning Curve ของ Model

จากภาพ จะเห็นว่า lr = 1.0, กราฟ Accuracy (Accuracy สีฟ้า และ Validate Accuracy สีส้ม) มีการแกว่งขึ้นลงอย่างน่าตกใจ และที่ lr = 1E-6 และ 1E-7, Model มีการเรียนรู้ค่อนข้างต่ำ ขณะที่ที่ lr = 0.1, 0.01 และ 0.001, Model ประสบความสำเร็จในการเรียนรู้มากกว่าที่ lr ค่าอื่นๆ โดยที่ lr = 0.1, Model มีอัตราการเรียนรู้สูงที่สุด คือ มี Accuracy สูงกว่า 80% ตั้งแต่ Epoch ต้นๆ

Momentum

Momentum (β) เป็นเทคนิคในการลดการแกว่งของ Learning Curves พร้อมกับเร่งอัตราการเรียนรู้ของ Model ให้เร็วขึ้น โดยใช้ Velocity (ความเร็ว) ของรอบก่อนหน้า และ Error ในรอบปัจจุบันเพื่อปรับปรุง

นิยาม Model, กำหนด lr = 0.01, Compile Model และ Plot Accuracy

def fit_model(trainX, trainy, testX, testy, momentum):
    """Train the dense classifier with SGD + momentum and plot its accuracy.

    Uses a fixed learning rate of 0.01, trains for 50 epochs and draws the
    training and validation accuracy curves on the active pyplot axes.

    Args:
        trainX: Training inputs (array exposing a ``shape`` attribute).
        trainy: Training labels, as expected by categorical cross-entropy.
        testX: Validation inputs.
        testy: Validation labels.
        momentum: Momentum value handed to the SGD optimizer.
    """
    model = Sequential()
    # Derive the per-sample shape from the data so the model matches the
    # inputs whether or not they were flattened beforehand.
    model.add(Flatten(input_shape=trainX.shape[1:]))
    # Hidden layer: input size is inferred, so no input_dim is given.
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))

    # Use the `learning_rate` keyword; the short `lr` spelling is deprecated.
    opt = SGD(learning_rate=0.01, momentum=momentum)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=50, verbose=1)

    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    pyplot.title('momentum='+str(momentum), pad=-80)
 
# Momentum values to compare, one training run per value, each drawn in
# its own cell of a 2x2 subplot grid.
momentums = [0.0, 0.5, 0.9, 0.99]
for i, momentum in enumerate(momentums):
    pyplot.subplot(2, 2, i + 1)
    fit_model(X_train, y_train, X_test, y_test, momentum)
# Adjust the layout once, after all four panels have been drawn.
pyplot.tight_layout()
pyplot.show()

จากภาพ จะเห็นว่าที่ momentum (β) = 0.9, Learning Curves ของ Model มีการแกว่งน้อยกว่า รวมทั้งมีอัตราการเรียนรู้ที่เร็วขึ้นกว่าตอนที่ไม่ได้ใช้ momentum อย่างเห็นได้ชัด

Learning Rate Decay

นอกจากนี้ยังสามารถเพิ่มประสิทธิภาพของ Model ได้โดยการค่อย ๆ ลด Learning Rate (Learning Rate Decay) ในแต่ละ Epoch ในอัตราที่เหมาะสม ดังเช่นในตัวอย่าง

นิยาม Model, กำหนด lr = 0.01, Compile Model และ Plot Accuracy

def fit_model(trainX, trainy, testX, testy, decay):
    """Train the dense classifier with a decaying SGD learning rate.

    Starts from a learning rate of 0.01, trains for 50 epochs and draws the
    training and validation accuracy curves on the active pyplot axes.

    Args:
        trainX: Training inputs (array exposing a ``shape`` attribute).
        trainy: Training labels, as expected by categorical cross-entropy.
        testX: Validation inputs.
        testy: Validation labels.
        decay: Decay rate applied to the learning rate per optimizer step.
    """
    model = Sequential()
    # Derive the per-sample shape from the data so the model matches the
    # inputs whether or not they were flattened beforehand.
    model.add(Flatten(input_shape=trainX.shape[1:]))
    model.add(Dense(units=50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(units=10, activation='softmax'))

    # The legacy `decay` optimizer argument computed
    # lr / (1 + decay * iterations). An InverseTimeDecay schedule with
    # decay_steps=1 expresses the same rule and, unlike `decay`, is
    # accepted by current optimizer implementations.
    schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=0.01, decay_steps=1, decay_rate=decay)
    opt = SGD(learning_rate=schedule)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=50, verbose=1)
    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    pyplot.title('decay='+str(decay), pad=-80)


# Decay rates to compare, one training run per value.
decay_rates = [1E-1, 1E-2, 1E-3, 1E-4]
# Train once per decay rate, drawing each run in its own cell of a 2x2
# subplot grid, then save the combined figure before showing it.
for panel, rate in enumerate(decay_rates, start=1):
    pyplot.subplot(2, 2, panel)
    fit_model(X_train, y_train, X_test, y_test, rate)
pyplot.legend()
pyplot.savefig('decay2.jpeg', dpi=300)
pyplot.show()

จากภาพ จะเห็นว่าที่ decay = 0.001 และ 0.0001, กราฟของ Model ในรอบหลัง มีการแกว่งน้อยกว่ารอบแรกๆ รวมทั้งมีค่า Accuracy ที่สูงกว่าตอนที่ยังไม่ได้ใช้ decay

Drop Learning Rate on Plateau

ในกรณีที่ Loss Value ไม่ลดลงเป็นระยะเวลาหนึ่ง เราจะเรียกสถานการณ์นี้ว่าการเจอที่ราบสูง (Plateau) อาจใช้เทคนิคการปรับลดค่า Learning Rate ด้วยการคูณกับค่า factor เช่น กำหนดให้ factor = 0.1, Learning Rate = 0.01 เมื่อ Loss Value ไม่ลดลง 5 Epoch (patience=5) Learning Rate จะถูกปรับลดลงเป็น 0.001 ((0.1)(0.01)) ตามตัวอย่าง

class LearningRateMonitor(Callback):
    """Keras callback that records the optimizer's learning rate per epoch.

    After training, ``lrates`` holds one float for every completed epoch.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh learning-rate history for this training run."""
        self.lrates = []

    def on_epoch_end(self, epoch, logs=None):
        """Append the optimizer's current learning rate to ``lrates``."""
        # Read the value through the full `learning_rate` attribute name
        # rather than the short `lr` alias.
        lrate = float(backend.get_value(self.model.optimizer.learning_rate))
        self.lrates.append(lrate)
 
def fit_model(trainX, trainy, testX, testy, patience):
    """Train the dense classifier while reducing the LR on plateaus.

    Trains for 200 epochs with SGD starting at a learning rate of 0.01. A
    ReduceLROnPlateau callback multiplies the learning rate by 0.1 when
    ``val_loss`` has not improved for ``patience`` epochs, and a
    LearningRateMonitor callback records the learning rate after each epoch.

    Args:
        trainX: Training inputs (array exposing a ``shape`` attribute).
        trainy: Training labels, as expected by categorical cross-entropy.
        testX: Validation inputs.
        testy: Validation labels.
        patience: Epochs without improvement before the rate is reduced.

    Returns:
        A tuple ``(lrates, loss, accuracy)`` of per-epoch lists: the
        recorded learning rates, the training loss and the training
        accuracy.
    """
    model = Sequential()
    # Derive the per-sample shape from the data so the model matches the
    # inputs whether or not they were flattened beforehand.
    model.add(Flatten(input_shape=trainX.shape[1:]))
    # Hidden layer: input size is inferred, so no input_dim is given.
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))

    # Use the `learning_rate` keyword; the short `lr` spelling is deprecated.
    opt = SGD(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    rlrp = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=patience, min_delta=1E-7)
    lrm = LearningRateMonitor()
    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=1, callbacks=[rlrp, lrm])
    return lrm.lrates, history.history['loss'], history.history['accuracy']


def line_plots(patiences, series, st):
    """Plot one series per patience value on a 2x2 grid, save and show it.

    Args:
        patiences: Patience values, used for the subplot titles.
        series: Per-epoch value lists, one per entry in ``patiences``.
        st: Short name of the plotted quantity; used as the line label so
            that the legend has an entry to display.
    """
    for i, patience in enumerate(patiences):
        pyplot.subplot(2, 2, i + 1)
        pyplot.plot(series[i], label=st)
        pyplot.title('patience='+str(patience), pad=-80)
    pyplot.legend()
    # NOTE(review): the file name is fixed, so every call overwrites the
    # image written by the previous one - confirm whether that is intended.
    pyplot.savefig('patience.jpeg', dpi=300)
    pyplot.show()
 
# Patience values to compare; each run contributes its per-epoch learning
# rates, training loss and training accuracy.
patiences = [2, 5, 10, 15]
lr_list, loss_list, acc_list = [], [], []
for patience in patiences:
    lr, loss, acc = fit_model(X_train, y_train, X_test, y_test, patience)
    lr_list.append(lr)
    loss_list.append(loss)
    acc_list.append(acc)

# One figure per recorded quantity, each with one panel per patience value.
line_plots(patiences, lr_list, 'lr')
line_plots(patiences, loss_list, 'loss')
line_plots(patiences, acc_list, 'acc')

Learning Rate (แกน y) ที่ลดลง ตามเงื่อนไขที่กำหนด (patience = 2, 5, 10, 15)

Loss Value (แกน y) ที่เกิดจากการปรับลด Learning Rate ตามเงื่อนไขที่กำหนด (patience = 2, 5, 10, 15)

Accuracy (แกน y) ที่เกิดจากการปรับลด Learning Rate ตามเงื่อนไขที่กำหนด (patience = 2, 5, 10, 15)

จากภาพ จะเห็นว่าที่ patience = 10, Loss Value ลดลงจนน้อยกว่า 0.4 ขณะที่ Accuracy เพิ่มขึ้นมากกว่า 80%

Adaptive Learning Rates Gradient Descent

ในตอนสุดท้ายเปรียบเทียบ Adaptive Learning Rate Algorithm ได้แก่ AdaGrad (Adaptive Gradient Algorithm), RMSprop (Root Mean Square Propagation) และ Adam (Adaptive Moment Estimation) กับ Optimizer พื้นฐาน (SGD Optimizer)

def fit_model(trainX, trainy, testX, testy, optimizer):
    """Train the dense classifier with a named optimizer and plot accuracy.

    Trains for 200 epochs and draws the training and validation accuracy
    curves on the currently active pyplot axes.

    Args:
        trainX: Training inputs (array exposing a ``shape`` attribute).
        trainy: Training labels, as expected by categorical cross-entropy.
        testX: Validation inputs.
        testy: Validation labels.
        optimizer: Optimizer name as a string (e.g. ``'adam'``); it is
            passed to ``model.compile`` and concatenated into the title.
    """
    model = Sequential()
    # Derive the per-sample shape from the data so the model matches the
    # inputs whether or not they were flattened beforehand.
    model.add(Flatten(input_shape=trainX.shape[1:]))
    # Hidden layer: input size is inferred, so no input_dim is given.
    model.add(Dense(50, activation='relu', kernel_initializer='he_uniform'))
    model.add(Dense(10, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=200, verbose=1)

    pyplot.plot(history.history['accuracy'], label='train')
    pyplot.plot(history.history['val_accuracy'], label='test')
    pyplot.title('opt='+optimizer, pad=-80)


# Optimizers to compare, each created by name with its default settings
# and drawn in its own cell of a 2x2 subplot grid.
optimizers = ['sgd', 'rmsprop', 'adagrad', 'adam']
for i, optimizer_name in enumerate(optimizers):
    pyplot.subplot(2, 2, i + 1)
    fit_model(X_train, y_train, X_test, y_test, optimizer_name)
pyplot.legend()
pyplot.savefig('adaptive.jpeg', dpi=300)
pyplot.show()

Reference
https://blog.pjjop.org/learning-rate

Colab
https://colab.research.google.com/drive/1ab579WHxeHnFlVCA4Xn_y1Dg3CWhZglG

บทความนี้เป็นส่วนหนึ่งของรายวิชา AI จัดทำขึ้นเพื่อทดลองทำ Lab ซึ่งหากมีข้อผิดพลาดใดๆ ขออภัยไว้ ณ ที่นี้