Ray Tune Documentation

jmani 2022. 6. 17. 11:51

ref: https://docs.ray.io/en/latest/tune/index.html

 

Tune: Scalable Hyperparameter Tuning — Ray 1.13.0 (docs.ray.io)

import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import matplotlib.pyplot as plt
import os
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from hyperopt import hp
from ray.tune.suggest.hyperopt import HyperOptSearch

# Change these values if you want the training to run quicker or slower.
EPOCH_SIZE = 512
TEST_SIZE = 256

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        # For simplicity, we don't change the model architecture in this example.
        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
        self.fc = nn.Linear(192, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 3))
        x = x.view(-1, 192)
        x = self.fc(x)
        return F.log_softmax(x, dim=1)


def train(model, optimizer, train_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # We set this just for the example to run quickly.
        if batch_idx * len(data) > EPOCH_SIZE:
            return
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()


def test(model, data_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(data_loader):
            # We set this just for the example to run quickly.
            if batch_idx * len(data) > TEST_SIZE:
                break
            data, target = data.to(device), target.to(device)
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    return correct / total


def train_mnist(config):
    # Data Setup
    mnist_transforms = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_loader = DataLoader(
        datasets.MNIST("~/data", train=True, download=True, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)
    test_loader = DataLoader(
        datasets.MNIST("~/data", train=False, transform=mnist_transforms),
        batch_size=64,
        shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = ConvNet()
    model.to(device)

    optimizer = optim.SGD(
        model.parameters(), lr=config["lr"], momentum=config["momentum"])

    for i in range(10):
        train(model, optimizer, train_loader)
        acc = test(model, test_loader)

        # Send the current training result back to Tune
        tune.report(mean_accuracy=acc)

        if i % 5 == 0:
            # This saves the model to the trial directory
            torch.save(model.state_dict(), "./model.pth")

# Uncomment this to enable distributed execution
# `ray.init(address="auto")`


"""
Early Stopping with ASHA
search space increasing > parameter <num_samples>

ASHA is implemented in Tune as a “Trial Scheduler”. 
These Trial Schedulers can early terminate bad trials, 
pause trials, clone trials, 
and alter hyperparameters of a running trial.
"""

def ASHA_scheduler(search_space):

    scheduler = \
        ASHAScheduler(metric="mean_accuracy", mode="max")
    # Obtain a trial dataframe from all run trials of this `tune.run` call.
    ASHA_anno = "early stopping with ASHA"
    
    # num_samples: how many times the whole search space is sampled (repeats a grid search; default: 1)
    analysis = tune.run(
        train_mnist,
        num_samples=20,
        scheduler=scheduler,
        config=search_space,
    )
    dfs = analysis.trial_dataframes
    plot(dfs, ASHA_anno)
    bt_model = best_model(analysis)
    return bt_model


def HyperOpt_search_alg(space):
    search_alg = \
        HyperOptSearch(space, metric="mean_accuracy", mode="max")
    # Obtain a trial dataframe from all run trials of this `tune.run` call.

    HyperOpt_anno = "HyperOpt Search Algorithm"
    analysis = tune.run(train_mnist,
                        num_samples=20,
                        search_alg=search_alg)

    # To enable GPUs, use this instead:
    # analysis = tune.run(
    #     train_mnist, config=search_space, resources_per_trial={'gpu': 1})
    dfs = analysis.trial_dataframes
    plot(dfs, HyperOpt_anno)
    bt_model = best_model(analysis)

    return bt_model


def gpu_HyperOpt_search_alg(space):
    # Obtain a trial dataframe from all run trials of this `tune.run` call.

    HyperOpt_anno = "HyperOpt Search Algorithm - GPU version"
    analysis = tune.run(train_mnist,
                        config=space,
                        num_samples=20,
                        resources_per_trial={'cpu': 2, 'gpu': 1})

    dfs = analysis.trial_dataframes
    plot(dfs, HyperOpt_anno)
    bt_model = best_model(analysis)

    return bt_model


def plot(dfs, name):
    # Plot by epoch
    ax = None  # This plots everything on the same plot
    for d in dfs.values():
        ax = d.mean_accuracy.plot(ax=ax, legend=False)
    ax.set_xlabel("Epochs")
    ax.set_ylabel("Mean Accuracy")

    plt.title(name)
    os.makedirs('img', exist_ok=True)  # make sure the output directory exists
    plt.savefig(os.path.join('img', name + '.jpg'))
    plt.show()


def best_model(analysis):
    df = analysis.results_df
    print(df)
    logdir = analysis.get_best_logdir("mean_accuracy", mode="max")
    state_dict = torch.load(os.path.join(logdir, "model.pth"))
    model = ConvNet()
    model.load_state_dict(state_dict)

    return model


if __name__ == "__main__":
    search_space = {
        "lr": tune.sample_from(lambda spec: 10 ** (-10 * np.random.rand())),
        "momentum": tune.uniform(0.1, 0.9),
    }

    space = {
        "lr": hp.uniform("lr", 1e-10, 0.1),
        "momentum": hp.uniform("momentum", 0.1, 0.9),
    }

    # TrialScheduler
    ASHA_scheduler(search_space)
    
    # Search Algorithm
    HyperOpt_search_alg(space)
    
    # GPU Search Algorithm
    # Why this is slow: each trial is allocated one whole GPU, so only as many trials as there are GPUs can run at once.
    gpu_HyperOpt_search_alg(search_space)

 

  • num_samples

num_samples multiplies the search space num times. In the example below, the 3x3 grid search is repeated num_samples (10) times, so a total of 90 trials (searches) are run.

ref: https://docs.ray.io/en/latest/tune/tutorials/tune-search-spaces.html

 tune.run(
     my_trainable,
     name="my_trainable",
     # num_samples will repeat the entire config 10 times.
     num_samples=10,
     config={
         # ``sample_from`` creates a generator to call the lambda once per trial.
         "alpha": tune.sample_from(lambda spec: np.random.uniform(100)),
         # ``sample_from`` also supports "conditional search spaces"
         "beta": tune.sample_from(lambda spec: spec.config.alpha * np.random.normal()),
         "nn_layers": [

             # tune.grid_search will make it so that all values are evaluated.

             tune.grid_search([16, 64, 256]),
             tune.grid_search([16, 64, 256]),
         ],
     },
 )

 

  • GPU usage and resource allocation

To use GPUs, set the resources_per_trial parameter of tune.run(). It determines how many resources each trial is allocated. For example, with 24 CPUs and 2 GPUs available, {'cpu': 2, 'gpu': 1} lets only 2 trials run at a time (the 2 GPUs are the bottleneck).

I expected the GPU runs to be faster, but they turned out to be several times slower.

The reason is that in auto mode Tune works with 20+ CPUs at once, whereas once resources_per_trial is fixed, each trial gets only the specified amount, so trials wait for others to finish.

# auto
analysis = tune.run(train_mnist,
                    num_samples=20,
                    search_alg=search_alg)
# GPU
analysis = tune.run(train_mnist,
                    config=space,
                    num_samples=20,
                    resources_per_trial={'cpu': 2, 'gpu': 1})
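
If you still want GPUs but more parallelism, Ray Tune also accepts fractional GPU allocations, so several trials can share one device as long as they fit in GPU memory. A minimal sketch under that assumption (the 0.5 value is only illustrative):

# Half a GPU per trial: with 2 GPUs, up to 4 trials can run at once
# (assuming each trial's model and batches fit in the shared GPU memory).
analysis = tune.run(train_mnist,
                    config=space,
                    num_samples=20,
                    resources_per_trial={'cpu': 2, 'gpu': 0.5})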

Also, to use GPUs, the search_space entries have to be changed to the tune.sample_from (function-call) form. (This is shown in the docs, but I don't know the reason.)

search_space = {
    "lr": tune.sample_from(lambda spec: 10 ** (-10 * np.random.rand())),
    "momentum": tune.uniform(0.1, 0.9),
}
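
For reference, the same log-uniform sampling can be written with Tune's built-in distribution instead of the lambda; this is just an equivalent sketch, not what the docs require for GPU runs:

# 10 ** (-10 * np.random.rand()) draws lr log-uniformly from [1e-10, 1],
# which tune.loguniform expresses directly.
search_space = {
    "lr": tune.loguniform(1e-10, 1.0),
    "momentum": tune.uniform(0.1, 0.9),
}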

Although 20 trials were launched, many were stopped after a single iteration, so only about 4 curves are visible in the plot.
Changed hp.loguniform to hp.uniform (the docs use loguniform, but its performance was very poor).
Why is the performance worse? It looks like it comes down to the parameters passed inside tune.run(search_alg=...).
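
One plausible explanation for the poor loguniform results (my own reading, not from the docs): hyperopt's hp.loguniform(label, low, high) treats low and high as bounds in log space and returns exp(uniform(low, high)). Passing the raw values 1e-10 and 0.1 therefore yields learning rates around exp(1e-10) to exp(0.1), roughly 1.0 to 1.1, which is far too large for this model, whereas hp.uniform keeps them in [1e-10, 0.1]. A sketch of the log-space version:

import numpy as np
from hyperopt import hp

# Bounds are given in log space, so the sampled lr lies in [1e-10, 0.1].
space = {
    "lr": hp.loguniform("lr", np.log(1e-10), np.log(0.1)),
    "momentum": hp.uniform("momentum", 0.1, 0.9),
}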

 

 
