"""Train a small fully connected network to regress ``tested_positive``.

The script reads ``./covid.train.csv`` and ``./covid.test.csv``, holds out a
fraction of the training rows for validation, optionally selects features with
a RidgeCV-based ``SelectFromModel``, trains ``MyModel`` with early stopping and
writes the test-set predictions to ``pred.csv``.

Importing this module only defines functions, classes, ``device`` and
``config``; all file I/O and training happen in ``main()``.
"""
# Numerical Operations
import csv
import math
import os

import numpy as np
# Reading/Writing Data
import pandas as pd
# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split


def same_seed(seed):
    """Fixes random number generator seeds for reproducibility."""
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def train_valid_split(data_set, valid_ratio, seed):
    """Split provided training data into training set and validation set.

    Args:
        data_set: Array-like of rows (anything ``np.asarray`` accepts).
        valid_ratio: Fraction of rows that goes to the validation set; the
            validation size is ``int(valid_ratio * len(data_set))``.
        seed: Seed of the dedicated ``torch.Generator`` used for the split, so
            the split does not depend on the global RNG state.

    Returns:
        ``(train_rows, valid_rows)`` as two numpy arrays.
    """
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_set, valid_set = random_split(
        data_set, [train_set_size, valid_set_size],
        generator=torch.Generator().manual_seed(seed))
    # Index with the subset indices instead of calling np.array() on the
    # Subset objects: same rows in the same order, but without element-wise
    # sequence iteration, and an empty split keeps its 2-D shape.
    rows = np.asarray(data_set)
    return rows[train_set.indices], rows[valid_set.indices]


def predict(test_loader, model, device):
    """Run ``model`` over ``test_loader`` and return all predictions.

    Args:
        test_loader: Iterable yielding feature batches (no targets).
        model: Module to evaluate; it is switched to evaluation mode.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A numpy array with the concatenated predictions of every batch.
    """
    # Imported lazily so the module can be imported without the progress-bar
    # package; only the training and prediction loops need it.
    from tqdm import tqdm

    model.eval()  # Set your model to evaluation mode.
    preds = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            preds.append(model(x.to(device)).detach().cpu())
    return torch.cat(preds, dim=0).numpy()


class COVID19Dataset(Dataset):
    """Dataset of feature rows with optional regression targets.

    x: Features.
    y: Targets, if none, do prediction.
    """

    def __init__(self, x, y=None):
        # Targets stay None in prediction mode so __getitem__ can tell the
        # two modes apart.
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        """Return ``x[idx]`` alone, or ``(x[idx], y[idx])`` if targets exist."""
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        return len(self.x)


class MyModel(nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 4 -> 2 -> 1 with ReLUs."""

    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 4),
            nn.ReLU(),
            nn.Linear(4, 2),
            nn.ReLU(),
            nn.Linear(2, 1),
        )

    def forward(self, x):
        x = self.layers(x)
        return x.squeeze(1)  # (B, 1) -> (B)


def select_feat(train_data, valid_data, test_data, select_all=True):
    """Selects useful features to perform regression.

    The last column of ``train_data`` and ``valid_data`` is the target; every
    other column is a candidate feature. ``test_data`` has no target column.

    Args:
        train_data: 2-D array of training rows (features + target).
        valid_data: 2-D array of validation rows (features + target).
        test_data: 2-D array of test rows (features only).
        select_all: Keep every feature column when true; otherwise keep the
            columns chosen by ``SelectFromModel(RidgeCV())`` fitted on the
            training rows.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    else:
        # scikit-learn is only needed on this branch, so import it lazily.
        from sklearn.feature_selection import SelectFromModel
        from sklearn.linear_model import RidgeCV

        selection = SelectFromModel(RidgeCV()).fit(raw_x_train, y_train)
        feat_idx = [i for i, support in enumerate(selection.get_support()) if support]
        print('selected feature list: ', feat_idx)

    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid


def evaluate_loss(loader, model, criterion, device):
    """Return the per-sample mean of ``criterion`` over every batch of ``loader``.

    Each batch loss is weighted by its batch size, so a short final batch does
    not count as much as a full one. ``criterion`` must return the batch mean.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()  # Set your model to evaluation mode.
    total_loss, n_samples = 0.0, 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            total_loss += criterion(model(x), y).item() * len(y)
            n_samples += len(y)
    if n_samples == 0:
        raise ValueError('cannot evaluate the loss on an empty data loader')
    return total_loss / n_samples


def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with Adam and early stopping on the validation loss.

    The state dict with the lowest validation loss is written to
    ``config['save_path']``. Training stops after ``config['n_epochs']`` epochs
    or once the validation loss has not improved for ``config['early_stop']``
    consecutive epochs.

    Returns:
        The best (lowest) validation loss that was reached.
    """
    # Imported lazily so the module can be imported without the progress-bar
    # package; only the training and prediction loops need it.
    from tqdm import tqdm

    criterion = nn.MSELoss(reduction='mean')  # Define your loss function, do not modify this.

    # Define your optimization algorithm.
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.1, lr=1e-3)

    # Create the directory the checkpoint is actually written to.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, early_stop_count = config['n_epochs'], math.inf, 0

    for epoch in range(n_epochs):
        model.train()  # Set your model to train mode.
        total_loss, n_samples = 0.0, 0

        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)
        train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')

        for x, y in train_pbar:
            optimizer.zero_grad()  # Set gradient to zero.
            x, y = x.to(device), y.to(device)  # Move your data to device.
            loss = criterion(model(x), y)
            loss.backward()  # Compute gradient(backpropagation).
            optimizer.step()  # Update parameters.

            batch_loss = loss.detach().item()
            # Weight by batch size so the epoch mean is a per-sample mean.
            total_loss += batch_loss * len(y)
            n_samples += len(y)

            # Display the current loss on the tqdm progress bar.
            train_pbar.set_postfix({'loss': batch_loss})

        mean_train_loss = total_loss / n_samples
        mean_valid_loss = evaluate_loss(valid_loader, model, criterion, device)
        print(f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path'])  # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            break

    print('Final best loss: ', best_loss)
    return best_loss


def save_pred(preds, file):
    """Save predictions to ``file`` as CSV with an ``id,tested_positive`` header."""
    # newline='' lets the csv module control line endings; without it every
    # row is followed by a blank line on Windows.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))


device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 12345,  # Your seed number, you can pick your lucky number. :)
    'select_all': False,  # Whether to use all features.
    'valid_ratio': 0.2,  # validation_size = train_size * valid_ratio
    'n_epochs': 10000,  # Number of epochs.
    'batch_size': 512,
    'early_stop': 500,  # If model has not improved for this many consecutive epochs, stop training.
    'save_path': './models/model.ckpt',  # Your model will be saved here.
}


def main():
    """Load the data, train the model once and write ``pred.csv``.

    Scores recorded by the original author:
        Score: 0.92776
        Public score: 0.87331
    NOTE: those numbers came from an earlier revision that trained twice in a
    row and averaged the loss per batch; they may not reproduce exactly.
    """
    # Set seed for reproducibility
    same_seed(config['seed'])

    # train_data size: 2699 x 118 (id + 37 states + 16 features x 5 days)
    # test_data size: 1078 x 117 (without last day's positive rate)
    train_data = pd.read_csv('./covid.train.csv').values
    test_data = pd.read_csv('./covid.test.csv').values
    train_data, valid_data = train_valid_split(train_data, config['valid_ratio'], config['seed'])

    # Print out the data size.
    print(f'train_data size: {train_data.shape}\n'
          f'valid_data size: {valid_data.shape}\n'
          f'test_data size: {test_data.shape}')

    # Select features
    x_train, x_valid, x_test, y_train, y_valid = select_feat(
        train_data, valid_data, test_data, config['select_all'])

    # Print out the number of features.
    print(f'number of features: {x_train.shape[1]}')

    train_dataset = COVID19Dataset(x_train, y_train)
    valid_dataset = COVID19Dataset(x_valid, y_valid)
    test_dataset = COVID19Dataset(x_test)

    # Pytorch data loader loads pytorch dataset into batches.
    # Pinned memory only helps host-to-GPU copies, so skip it on CPU.
    pin_memory = device == 'cuda'
    batch_size = config['batch_size']
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=pin_memory)
    # Evaluation order does not matter, so validation and test stay unshuffled.
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, pin_memory=pin_memory)

    # Put your model and data on the same computation device.
    model = MyModel(input_dim=x_train.shape[1]).to(device)
    trainer(train_loader, valid_loader, model, config, device)

    print(config)
    # Reload the best checkpoint rather than using the last-epoch weights.
    model = MyModel(input_dim=x_train.shape[1]).to(device)
    model.load_state_dict(torch.load(config['save_path'], map_location=device))
    preds = predict(test_loader, model, device)
    save_pred(preds, 'pred.csv')


if __name__ == '__main__':
    main()