From 4991db0a364edfc89da1b3c205ab718a1416c041 Mon Sep 17 00:00:00 2001 From: thecml Date: Thu, 27 May 2021 10:05:26 +0200 Subject: [PATCH 01/10] simplified code, made it run on windows --- main.py | 258 ++++++++++++------------------------------- tools/folder2lmdb.py | 49 ++++---- 2 files changed, 96 insertions(+), 211 deletions(-) diff --git a/main.py b/main.py index a8984db..fdc63c6 100644 --- a/main.py +++ b/main.py @@ -13,10 +13,13 @@ import torch.optim import torch.utils.data import torch.utils.data.distributed +import torchvision import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models +from tools.folder2lmdb import ImageFolderLMDB + model_names = sorted(name for name in models.__dict__ if name.islower() and not name.startswith("__") and callable(models.__dict__[name])) @@ -64,136 +67,58 @@ best_acc1 = 0 +DBS = ['lmdb', 'imagefolder'] +PRINT_STATUS = True def main(): - global args, best_acc1 - args = parser.parse_args() - - if args.seed is not None: - random.seed(args.seed) - torch.manual_seed(args.seed) - cudnn.deterministic = True - warnings.warn('You have chosen to seed training. ' - 'This will turn on the CUDNN deterministic setting, ' - 'which can slow down your training considerably! ' - 'You may see unexpected behavior when restarting ' - 'from checkpoints.') - - if args.gpu is not None: - warnings.warn('You have chosen a specific GPU. This will completely ' - 'disable data parallelism.') - - args.distributed = args.world_size > 1 - - if args.distributed: - dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, - world_size=args.world_size) - - # create model - if args.pretrained: - print("=> using pre-trained model '{}'".format(args.arch)) - model = models.__dict__[args.arch](pretrained=True) - else: - print("=> creating model '{}'".format(args.arch)) - model = models.__dict__[args.arch]() - - if args.gpu is not None: - model = model.cuda(args.gpu) - elif args.distributed: - model.cuda() - model = torch.nn.parallel.DistributedDataParallel(model) - else: - if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): - model.features = torch.nn.DataParallel(model.features) - model.cuda() - else: - model = torch.nn.DataParallel(model).cuda() - + model = models.__dict__['resnet18'](pretrained=True) + model_params = model.parameters() + data_dir = "C:\\Users\\cml\\Downloads\\cats_vs_dogs\\train" + data_db = "C:\\Users\\cml\\Downloads\\cats_vs_dogs\\train.lmdb" + + # send model to gpu + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = model.to(device) + # define loss function (criterion) and optimizer - criterion = nn.CrossEntropyLoss().cuda(args.gpu) - - optimizer = torch.optim.SGD(model.parameters(), args.lr, - momentum=args.momentum, - weight_decay=args.weight_decay) - - # optionally resume from a checkpoint - if args.resume: - if os.path.isfile(args.resume): - print("=> loading checkpoint '{}'".format(args.resume)) - checkpoint = torch.load(args.resume) - args.start_epoch = checkpoint['epoch'] - best_acc1 = checkpoint['best_acc1'] - model.load_state_dict(checkpoint['state_dict']) - optimizer.load_state_dict(checkpoint['optimizer']) - print("=> loaded checkpoint '{}' (epoch {})" - .format(args.resume, checkpoint['epoch'])) - else: - print("=> no checkpoint found at '{}'".format(args.resume)) + criterion = nn.CrossEntropyLoss() + optimizer = torch.optim.SGD(model_params, lr=0.01) - cudnn.benchmark = True - - # Data loading code - traindir = os.path.join(args.data, 'train') - valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - train_dataset = datasets.ImageFolder( - traindir, - transforms.Compose([ - transforms.RandomResizedCrop(224), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - if args.distributed: - train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) - else: - train_sampler = None - - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) - - val_loader = torch.utils.data.DataLoader( - datasets.ImageFolder(valdir, transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - normalize, - ])), - batch_size=args.batch_size, shuffle=False, - num_workers=args.workers, pin_memory=True) - - if args.evaluate: - validate(val_loader, model, criterion) - return - - for epoch in range(args.start_epoch, args.epochs): - if args.distributed: - train_sampler.set_epoch(epoch) - adjust_learning_rate(optimizer, epoch) - - # train for one epoch - train(train_loader, model, criterion, optimizer, epoch) - - # evaluate on validation set - acc1 = validate(val_loader, model, criterion) - - # remember best acc@1 and save checkpoint - is_best = acc1 > best_acc1 - best_acc1 = max(acc1, best_acc1) - save_checkpoint({ - 'epoch': epoch + 1, - 'arch': args.arch, - 'state_dict': model.state_dict(), - 'best_acc1': best_acc1, - 'optimizer' : optimizer.state_dict(), - }, is_best) - - -def train(train_loader, model, criterion, optimizer, epoch): + for dataset_type in DBS: + if dataset_type == 'lmdb': + train_dataset = ImageFolderLMDB( + data_db, + transforms.Compose([ + transforms.RandomResizedCrop(64), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + else: + train_dataset = torchvision.datasets.ImageFolder( + data_dir, + transforms.Compose([ + transforms.RandomResizedCrop(64), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=128, shuffle=True, + num_workers=4, pin_memory=True) + + batch_time, data_time = train(train_loader, model, criterion, optimizer, device) + print(f"Timings for {dataset_type}: ") + print(f"Avg data time: {data_time.avg}") + print(f"Avg batch time: {batch_time.avg}") + print(f"Total data time: {data_time.sum}") + print(f"Total batch time: {batch_time.sum}\n") + +def train(train_loader, model, criterion, optimizer, device, epoch=0): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() @@ -207,14 +132,14 @@ def train(train_loader, model, criterion, optimizer, epoch): for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) - - if args.gpu is not None: - input = input.cuda(args.gpu, non_blocking=True) - target = target.cuda(args.gpu, non_blocking=True) + + # send input + target to gpu + input = input.to(device) + target = target.to(device) # compute output output = model(input) - loss = criterion(output, target) + loss = criterion(output, target.squeeze()) # measure accuracy and record loss acc1, acc5 = accuracy(output, target, topk=(1, 5)) @@ -231,68 +156,24 @@ def train(train_loader, model, criterion, optimizer, epoch): batch_time.update(time.time() - end) end = time.time() - if i % args.print_freq == 0: - print('Epoch: [{0}][{1}/{2}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - epoch, i, len(train_loader), batch_time=batch_time, - data_time=data_time, loss=losses, top1=top1, top5=top5)) - - -def validate(val_loader, model, criterion): - batch_time = AverageMeter() - losses = AverageMeter() - top1 = AverageMeter() - top5 = AverageMeter() - - # switch to evaluate mode - model.eval() - - with torch.no_grad(): - end = time.time() - for i, (input, target) in enumerate(val_loader): - if args.gpu is not None: - input = input.cuda(args.gpu, non_blocking=True) - target = target.cuda(args.gpu, non_blocking=True) - - # compute output - output = model(input) - loss = criterion(output, target) - - # measure accuracy and record loss - acc1, acc5 = accuracy(output, target, topk=(1, 5)) - losses.update(loss.item(), input.size(0)) - top1.update(acc1[0], input.size(0)) - top5.update(acc5[0], input.size(0)) - - # measure elapsed time - batch_time.update(time.time() - end) - end = time.time() - - if i % args.print_freq == 0: - print('Test: [{0}/{1}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - i, len(val_loader), batch_time=batch_time, loss=losses, - top1=top1, top5=top5)) - - print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' - .format(top1=top1, top5=top5)) - - return top1.avg - + if i % 10 == 0: + if PRINT_STATUS: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1, top5=top5)) + + return batch_time, data_time def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): torch.save(state, filename) if is_best: shutil.copyfile(filename, 'model_best.pth.tar') - class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self): @@ -310,16 +191,14 @@ def update(self, val, n=1): self.count += n self.avg = self.sum / self.count - def adjust_learning_rate(optimizer, epoch): """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - lr = args.lr * (0.1 ** (epoch // 30)) + lr = 0.01 * (0.1 ** (epoch // 30)) for param_group in optimizer.param_groups: param_group['lr'] = lr - def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the specified values of k""" + """Computes the accuracy over the k top predictions for the ecified values of k""" with torch.no_grad(): maxk = max(topk) batch_size = target.size(0) @@ -330,10 +209,9 @@ def accuracy(output, target, topk=(1,)): res = [] for k in topk: - correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(100.0 / batch_size)) return res - if __name__ == '__main__': main() \ No newline at end of file diff --git a/tools/folder2lmdb.py b/tools/folder2lmdb.py index 65c2af4..44d34c5 100755 --- a/tools/folder2lmdb.py +++ b/tools/folder2lmdb.py @@ -3,13 +3,11 @@ import os, sys import os.path as osp from PIL import Image -import six -import string +import six import lmdb import pickle import msgpack -import tqdm import pyarrow as pa import torch @@ -23,28 +21,29 @@ class ImageFolderLMDB(data.Dataset): def __init__(self, db_path, transform=None, target_transform=None): self.db_path = db_path - self.env = lmdb.open(db_path, subdir=osp.isdir(db_path), - readonly=True, lock=False, - readahead=False, meminit=False) - with self.env.begin(write=False) as txn: - # self.length = txn.stat()['entries'] - 1 - self.length =pa.deserialize(txn.get(b'__len__')) - self.keys= pa.deserialize(txn.get(b'__keys__')) - self.transform = transform self.target_transform = target_transform + def open_lmdb(self): + self.env = lmdb.open(self.db_path, subdir=osp.isdir(self.db_path), + readonly=True, lock=False, + readahead=False, meminit=False) + self.txn = self.env.begin(write=False, buffers=True) + self.length = pa.deserialize(self.txn.get(b'__len__')) + self.keys = pa.deserialize(self.txn.get(b'__keys__')) + def __getitem__(self, index): + if not hasattr(self, 'txn'): + self.open_lmdb() + img, target = None, None - env = self.env - with env.begin(write=False) as txn: - byteflow = txn.get(self.keys[index]) + byteflow = self.txn.get(self.keys[index]) unpacked = pa.deserialize(byteflow) # load image imgbuf = unpacked[0] buf = six.BytesIO() - buf.write(imgbuf) + buf.write(imgbuf[0]) buf.seek(0) img = Image.open(buf).convert('RGB') @@ -60,7 +59,8 @@ def __getitem__(self, index): return img, target def __len__(self): - return self.length + return 25000 + #return self.length def __repr__(self): return self.__class__.__name__ + ' (' + self.db_path + ')' @@ -130,25 +130,29 @@ def dumps_pyarrow(obj): return pa.serialize(obj).to_buffer() -def folder2lmdb(dpath, name="train", write_frequency=5000, num_workers=16): +def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=16): + def collate_fn(x): + return x + 1 + directory = osp.expanduser(osp.join(dpath, name)) print("Loading dataset from %s" % directory) dataset = ImageFolder(directory, loader=raw_reader) - data_loader = DataLoader(dataset, num_workers=num_workers, collate_fn=lambda x: x) + data_loader = DataLoader(dataset, num_workers=num_workers) lmdb_path = osp.join(dpath, "%s.lmdb" % name) isdir = os.path.isdir(lmdb_path) print("Generate LMDB to %s" % lmdb_path) db = lmdb.open(lmdb_path, subdir=isdir, - map_size=1099511627776 * 2, readonly=False, + map_size=30737418240, readonly=False, meminit=False, map_async=True) print(len(dataset), len(data_loader)) txn = db.begin(write=True) - for idx, data in enumerate(data_loader): + for idx, (data, label) in enumerate(data_loader): # print(type(data), data) - image, label = data[0] + image = data + label = label.numpy() txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow((image, label))) if idx % write_frequency == 0: print("[%d/%d]" % (idx, len(data_loader))) @@ -176,5 +180,8 @@ def folder2lmdb(dpath, name="train", write_frequency=5000, num_workers=16): parser.add_argument('-p', '--procs', type=int, default=20) args = parser.parse_args() + + args.folder = "C:\\Users\\cml\\Downloads\\cats_vs_dogs" + args.split = "train" folder2lmdb(args.folder, num_workers=args.procs, name=args.split) From d1aa849e86a01ecdeef210a0f0ecca2fc3474cdf Mon Sep 17 00:00:00 2001 From: thecml Date: Thu, 27 May 2021 11:42:05 +0200 Subject: [PATCH 02/10] made size file, used only pickle --- main.py | 9 ++++++++- tools/folder2lmdb.py | 46 +++++++++++++++++++------------------------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/main.py b/main.py index fdc63c6..1e5cfb2 100644 --- a/main.py +++ b/main.py @@ -4,6 +4,7 @@ import shutil import time import warnings +import os.path as osp import torch import torch.nn as nn @@ -75,7 +76,8 @@ def main(): model_params = model.parameters() data_dir = "C:\\Users\\cml\\Downloads\\cats_vs_dogs\\train" data_db = "C:\\Users\\cml\\Downloads\\cats_vs_dogs\\train.lmdb" - + directory = "C:\\Users\\cml\\Downloads\\cats_vs_dogs" + # send model to gpu device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = model.to(device) @@ -86,11 +88,16 @@ def main(): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + # get the size of the db + with open(osp.join(directory, 'LMDB_SIZE'), 'r') as fd: + data_size = int(fd.read()) for dataset_type in DBS: if dataset_type == 'lmdb': train_dataset = ImageFolderLMDB( data_db, + data_size, transforms.Compose([ transforms.RandomResizedCrop(64), transforms.RandomHorizontalFlip(), diff --git a/tools/folder2lmdb.py b/tools/folder2lmdb.py index 44d34c5..228f7d1 100755 --- a/tools/folder2lmdb.py +++ b/tools/folder2lmdb.py @@ -17,10 +17,10 @@ from torchvision.datasets import ImageFolder from torchvision import transforms, datasets - class ImageFolderLMDB(data.Dataset): - def __init__(self, db_path, transform=None, target_transform=None): + def __init__(self, db_path, db_size, transform=None, target_transform=None): self.db_path = db_path + self.db_size = db_size self.transform = transform self.target_transform = target_transform @@ -29,8 +29,8 @@ def open_lmdb(self): readonly=True, lock=False, readahead=False, meminit=False) self.txn = self.env.begin(write=False, buffers=True) - self.length = pa.deserialize(self.txn.get(b'__len__')) - self.keys = pa.deserialize(self.txn.get(b'__keys__')) + self.length = pickle.loads(self.txn.get(b'__len__')) + self.keys = pickle.loads(self.txn.get(b'__keys__')) def __getitem__(self, index): if not hasattr(self, 'txn'): @@ -38,7 +38,7 @@ def __getitem__(self, index): img, target = None, None byteflow = self.txn.get(self.keys[index]) - unpacked = pa.deserialize(byteflow) + unpacked = pickle.loads(byteflow) # load image imgbuf = unpacked[0] @@ -59,13 +59,11 @@ def __getitem__(self, index): return img, target def __len__(self): - return 25000 - #return self.length + return self.db_size def __repr__(self): return self.__class__.__name__ + ' (' + self.db_path + ')' - class ImageFolderLMDB_old(data.Dataset): def __init__(self, db_path, transform=None, target_transform=None): import lmdb @@ -113,27 +111,21 @@ def __len__(self): def __repr__(self): return self.__class__.__name__ + ' (' + self.db_path + ')' - def raw_reader(path): with open(path, 'rb') as f: bin_data = f.read() return bin_data - -def dumps_pyarrow(obj): +def dumps_pickle(obj): """ Serialize an object. - - Returns: - Implementation-dependent bytes-like object + + Returns : + The pickled representation of the object obj as a bytes object """ - return pa.serialize(obj).to_buffer() - + return pickle.dumps(obj) -def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=16): - def collate_fn(x): - return x + 1 - +def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=0): directory = osp.expanduser(osp.join(dpath, name)) print("Loading dataset from %s" % directory) dataset = ImageFolder(directory, loader=raw_reader) @@ -153,7 +145,7 @@ def collate_fn(x): # print(type(data), data) image = data label = label.numpy() - txn.put(u'{}'.format(idx).encode('ascii'), dumps_pyarrow((image, label))) + txn.put(u'{}'.format(idx).encode('ascii'), dumps_pickle((image, label))) if idx % write_frequency == 0: print("[%d/%d]" % (idx, len(data_loader))) txn.commit() @@ -163,21 +155,23 @@ def collate_fn(x): txn.commit() keys = [u'{}'.format(k).encode('ascii') for k in range(idx + 1)] with db.begin(write=True) as txn: - txn.put(b'__keys__', dumps_pyarrow(keys)) - txn.put(b'__len__', dumps_pyarrow(len(keys))) + txn.put(b'__keys__', dumps_pickle(keys)) + txn.put(b'__len__', dumps_pickle(len(keys))) + with open(osp.join(dpath, 'LMDB_SIZE'), 'w') as fd: + fd.write(str(len(keys))) + print("Flushing database ...") db.sync() db.close() - -if __name__ == "__main__": +if __name__=='__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("-f", "--folder", type=str) parser.add_argument('-s', '--split', type=str, default="val") parser.add_argument('--out', type=str, default=".") - parser.add_argument('-p', '--procs', type=int, default=20) + parser.add_argument('-p', '--procs', type=int, default=0) args = parser.parse_args() From 0b27ce78a1cb09c81f0d793aa9d65c8b2c991b29 Mon Sep 17 00:00:00 2001 From: thecml Date: Thu, 27 May 2021 11:42:58 +0200 Subject: [PATCH 03/10] removed unused deps --- main.py | 8 -------- tools/folder2lmdb.py | 10 +--------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/main.py b/main.py index 1e5cfb2..45390d7 100644 --- a/main.py +++ b/main.py @@ -1,24 +1,16 @@ import argparse -import os -import random import shutil import time -import warnings import os.path as osp - import torch import torch.nn as nn import torch.nn.parallel -import torch.backends.cudnn as cudnn -import torch.distributed as dist import torch.optim import torch.utils.data import torch.utils.data.distributed import torchvision import torchvision.transforms as transforms -import torchvision.datasets as datasets import torchvision.models as models - from tools.folder2lmdb import ImageFolderLMDB model_names = sorted(name for name in models.__dict__ diff --git a/tools/folder2lmdb.py b/tools/folder2lmdb.py index 228f7d1..f4950c4 100755 --- a/tools/folder2lmdb.py +++ b/tools/folder2lmdb.py @@ -1,21 +1,13 @@ import os import os.path as osp -import os, sys -import os.path as osp -from PIL import Image - import six import lmdb import pickle import msgpack -import pyarrow as pa - -import torch import torch.utils.data as data +from PIL import Image from torch.utils.data import DataLoader -from torchvision.transforms import transforms from torchvision.datasets import ImageFolder -from torchvision import transforms, datasets class ImageFolderLMDB(data.Dataset): def __init__(self, db_path, db_size, transform=None, target_transform=None): From 37fb341a79d596390762c0d2f396e6d4fcc149f5 Mon Sep 17 00:00:00 2001 From: thecml Date: Thu, 27 May 2021 11:43:23 +0200 Subject: [PATCH 04/10] removed pyarrow from requirements --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index eca22b4..6e04fd3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ torch torchvision -lmdb -pyarrow +lmdb \ No newline at end of file From c1bc18353e15a048a8cfc1b4a2ebee572c4cbedf Mon Sep 17 00:00:00 2001 From: thecml Date: Thu, 27 May 2021 12:47:22 +0200 Subject: [PATCH 05/10] fixed a description --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 45390d7..b171e24 100644 --- a/main.py +++ b/main.py @@ -197,7 +197,7 @@ def adjust_learning_rate(optimizer, epoch): param_group['lr'] = lr def accuracy(output, target, topk=(1,)): - """Computes the accuracy over the k top predictions for the ecified values of k""" + """Computes the accuracy over the k top predictions for the specified values of k""" with torch.no_grad(): maxk = max(topk) batch_size = target.size(0) From 38d227b0fdfda0e0550115f9e6f71ae562a3a857 Mon Sep 17 00:00:00 2001 From: thecml Date: Fri, 28 May 2021 11:25:22 +0200 Subject: [PATCH 06/10] removed pyarrow, runs on windows --- main.py | 255 ++++++++++++++++++++++++++++++++----------- tools/folder2lmdb.py | 15 +-- 2 files changed, 194 insertions(+), 76 deletions(-) diff --git a/main.py b/main.py index b171e24..3beadf8 100644 --- a/main.py +++ b/main.py @@ -1,17 +1,25 @@ import argparse import shutil import time +import random import os.path as osp import torch import torch.nn as nn import torch.nn.parallel import torch.optim +import torch.backends.cudnn as cudnn +import torch.distributed as dist import torch.utils.data import torch.utils.data.distributed import torchvision import torchvision.transforms as transforms +import torchvision.datasets as datasets import torchvision.models as models from tools.folder2lmdb import ImageFolderLMDB +import lmdb +import pickle +import warnings +import os model_names = sorted(name for name in models.__dict__ if name.islower() and not name.startswith("__") @@ -60,64 +68,132 @@ best_acc1 = 0 -DBS = ['lmdb', 'imagefolder'] -PRINT_STATUS = True def main(): - model = models.__dict__['resnet18'](pretrained=True) - model_params = model.parameters() - data_dir = "C:\\Users\\cml\\Downloads\\cats_vs_dogs\\train" - data_db = "C:\\Users\\cml\\Downloads\\cats_vs_dogs\\train.lmdb" - directory = "C:\\Users\\cml\\Downloads\\cats_vs_dogs" + global args, best_acc1 + args = parser.parse_args() + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. This will completely ' + 'disable data parallelism.') + + args.distributed = args.world_size > 1 + + if args.distributed: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size) - # send model to gpu - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - model = model.to(device) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + if args.gpu is not None: + model = model.cuda(args.gpu) + elif args.distributed: + model.cuda() + model = torch.nn.parallel.DistributedDataParallel(model) + else: + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + model = torch.nn.DataParallel(model).cuda() # define loss function (criterion) and optimizer - criterion = nn.CrossEntropyLoss() - optimizer = torch.optim.SGD(model_params, lr=0.01) + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - # get the size of the db - with open(osp.join(directory, 'LMDB_SIZE'), 'r') as fd: - data_size = int(fd.read()) - - for dataset_type in DBS: - if dataset_type == 'lmdb': - train_dataset = ImageFolderLMDB( - data_db, - data_size, - transforms.Compose([ - transforms.RandomResizedCrop(64), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - else: - train_dataset = torchvision.datasets.ImageFolder( - data_dir, - transforms.Compose([ - transforms.RandomResizedCrop(64), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - normalize, - ])) - - train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=128, shuffle=True, - num_workers=4, pin_memory=True) + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + else: + train_sampler = None + + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + adjust_learning_rate(optimizer, epoch) + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion) + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + }, is_best) - batch_time, data_time = train(train_loader, model, criterion, optimizer, device) - print(f"Timings for {dataset_type}: ") - print(f"Avg data time: {data_time.avg}") - print(f"Avg batch time: {batch_time.avg}") - print(f"Total data time: {data_time.sum}") - print(f"Total batch time: {batch_time.sum}\n") -def train(train_loader, model, criterion, optimizer, device, epoch=0): +def train(train_loader, model, criterion, optimizer, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() @@ -131,14 +207,14 @@ def train(train_loader, model, criterion, optimizer, device, epoch=0): for i, (input, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) - - # send input + target to gpu - input = input.to(device) - target = target.to(device) + + if args.gpu is not None: + input = input.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) # compute output output = model(input) - loss = criterion(output, target.squeeze()) + loss = criterion(output, target) # measure accuracy and record loss acc1, acc5 = accuracy(output, target, topk=(1, 5)) @@ -155,24 +231,68 @@ def train(train_loader, model, criterion, optimizer, device, epoch=0): batch_time.update(time.time() - end) end = time.time() - if i % 10 == 0: - if PRINT_STATUS: - print('Epoch: [{0}][{1}/{2}]\t' - 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' - 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' - 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' - 'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t' - 'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format( - epoch, i, len(train_loader), batch_time=batch_time, - data_time=data_time, loss=losses, top1=top1, top5=top5)) - - return batch_time, data_time + if i % args.print_freq == 0: + print('Epoch: [{0}][{1}/{2}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + epoch, i, len(train_loader), batch_time=batch_time, + data_time=data_time, loss=losses, top1=top1, top5=top5)) + + +def validate(val_loader, model, criterion): + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + + # switch to evaluate mode + model.eval() + + with torch.no_grad(): + end = time.time() + for i, (input, target) in enumerate(val_loader): + if args.gpu is not None: + input = input.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(input) + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1[0], input.size(0)) + top5.update(acc5[0], input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if i % args.print_freq == 0: + print('Test: [{0}/{1}]\t' + 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' + 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' + 'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t' + 'Acc@5 {top5.val:.3f} ({top5.avg:.3f})'.format( + i, len(val_loader), batch_time=batch_time, loss=losses, + top1=top1, top5=top5)) + + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}' + .format(top1=top1, top5=top5)) + + return top1.avg + def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): torch.save(state, filename) if is_best: shutil.copyfile(filename, 'model_best.pth.tar') + class AverageMeter(object): """Computes and stores the average and current value""" def __init__(self): @@ -190,12 +310,14 @@ def update(self, val, n=1): self.count += n self.avg = self.sum / self.count + def adjust_learning_rate(optimizer, epoch): """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - lr = 0.01 * (0.1 ** (epoch // 30)) + lr = args.lr * (0.1 ** (epoch // 30)) for param_group in optimizer.param_groups: param_group['lr'] = lr + def accuracy(output, target, topk=(1,)): """Computes the accuracy over the k top predictions for the specified values of k""" with torch.no_grad(): @@ -208,9 +330,10 @@ def accuracy(output, target, topk=(1,)): res = [] for k in topk: - correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + correct_k = correct[:k].view(-1).float().sum(0, keepdim=True) res.append(correct_k.mul_(100.0 / batch_size)) return res + if __name__ == '__main__': main() \ No newline at end of file diff --git a/tools/folder2lmdb.py b/tools/folder2lmdb.py index f4950c4..85a6cfb 100755 --- a/tools/folder2lmdb.py +++ b/tools/folder2lmdb.py @@ -10,9 +10,9 @@ from torchvision.datasets import ImageFolder class ImageFolderLMDB(data.Dataset): - def __init__(self, db_path, db_size, transform=None, target_transform=None): + def __init__(self, db_path, lengeth, transform=None, target_transform=None): self.db_path = db_path - self.db_size = db_size + self.length = lengeth self.transform = transform self.target_transform = target_transform @@ -51,7 +51,7 @@ def __getitem__(self, index): return img, target def __len__(self): - return self.db_size + return self.length def __repr__(self): return self.__class__.__name__ + ' (' + self.db_path + ')' @@ -127,8 +127,9 @@ def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=0) isdir = os.path.isdir(lmdb_path) print("Generate LMDB to %s" % lmdb_path) + map_size = 30737418240 # this should be adjusted based on OS/db size db = lmdb.open(lmdb_path, subdir=isdir, - map_size=30737418240, readonly=False, + map_size=map_size, readonly=False, meminit=False, map_async=True) print(len(dataset), len(data_loader)) @@ -150,9 +151,6 @@ def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=0) txn.put(b'__keys__', dumps_pickle(keys)) txn.put(b'__len__', dumps_pickle(len(keys))) - with open(osp.join(dpath, 'LMDB_SIZE'), 'w') as fd: - fd.write(str(len(keys))) - print("Flushing database ...") db.sync() db.close() @@ -166,8 +164,5 @@ def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=0) parser.add_argument('-p', '--procs', type=int, default=0) args = parser.parse_args() - - args.folder = "C:\\Users\\cml\\Downloads\\cats_vs_dogs" - args.split = "train" folder2lmdb(args.folder, num_workers=args.procs, name=args.split) From 3b037a7b2c2ea820a47c33b87a0e79e994beffc3 Mon Sep 17 00:00:00 2001 From: thecml Date: Fri, 28 May 2021 11:35:44 +0200 Subject: [PATCH 07/10] now opens db in init to set length and keys --- tools/folder2lmdb.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/folder2lmdb.py b/tools/folder2lmdb.py index 85a6cfb..3eaaa6f 100755 --- a/tools/folder2lmdb.py +++ b/tools/folder2lmdb.py @@ -15,14 +15,19 @@ def __init__(self, db_path, lengeth, transform=None, target_transform=None): self.length = lengeth self.transform = transform self.target_transform = target_transform + + env = lmdb.open(self.db_path, subdir=osp.isdir(self.db_path), + readonly=True, lock=False, + readahead=False, meminit=False) + with env.begin(write=False) as txn: + self.length = pickle.loads(txn.get(b'__len__')) + self.keys = pickle.loads(txn.get(b'__keys__')) def open_lmdb(self): self.env = lmdb.open(self.db_path, subdir=osp.isdir(self.db_path), readonly=True, lock=False, readahead=False, meminit=False) self.txn = self.env.begin(write=False, buffers=True) - self.length = pickle.loads(self.txn.get(b'__len__')) - self.keys = pickle.loads(self.txn.get(b'__keys__')) def __getitem__(self, index): if not hasattr(self, 'txn'): From a5457157eb5431d5185f2a1b53a5fd5e778cdc9f Mon Sep 17 00:00:00 2001 From: thecml Date: Fri, 28 May 2021 11:37:17 +0200 Subject: [PATCH 08/10] added evaluate support --- main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/main.py b/main.py index 3beadf8..58143cc 100644 --- a/main.py +++ b/main.py @@ -170,6 +170,10 @@ def main(): batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) + if args.evaluate: + validate(val_loader, model, criterion) + return + for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) From 45e69c551cb1755a7b2f10d0ecc2afa34ac3be30 Mon Sep 17 00:00:00 2001 From: thecml Date: Fri, 28 May 2021 11:37:59 +0200 Subject: [PATCH 09/10] removed unused imports --- main.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/main.py b/main.py index 58143cc..5003611 100644 --- a/main.py +++ b/main.py @@ -2,7 +2,6 @@ import shutil import time import random -import os.path as osp import torch import torch.nn as nn import torch.nn.parallel @@ -11,13 +10,9 @@ import torch.distributed as dist import torch.utils.data import torch.utils.data.distributed -import torchvision import torchvision.transforms as transforms import torchvision.datasets as datasets import torchvision.models as models -from tools.folder2lmdb import ImageFolderLMDB -import lmdb -import pickle import warnings import os From 1b93630e2322beaed923369c8428c38e2185d582 Mon Sep 17 00:00:00 2001 From: thecml Date: Fri, 28 May 2021 11:45:08 +0200 Subject: [PATCH 10/10] removed length argument --- main.py | 4 ++-- tools/folder2lmdb.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 5003611..81d8bef 100644 --- a/main.py +++ b/main.py @@ -152,8 +152,8 @@ def main(): train_sampler = None train_loader = torch.utils.data.DataLoader( - train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), - num_workers=args.workers, pin_memory=True, sampler=train_sampler) + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) val_loader = torch.utils.data.DataLoader( datasets.ImageFolder(valdir, transforms.Compose([ diff --git a/tools/folder2lmdb.py b/tools/folder2lmdb.py index 3eaaa6f..f25ed84 100755 --- a/tools/folder2lmdb.py +++ b/tools/folder2lmdb.py @@ -10,9 +10,8 @@ from torchvision.datasets import ImageFolder class ImageFolderLMDB(data.Dataset): - def __init__(self, db_path, lengeth, transform=None, target_transform=None): + def __init__(self, db_path, transform=None, target_transform=None): self.db_path = db_path - self.length = lengeth self.transform = transform self.target_transform = target_transform @@ -122,7 +121,7 @@ def dumps_pickle(obj): """ return pickle.dumps(obj) -def folder2lmdb(dpath, name="train_images", write_frequency=5000, num_workers=0): +def folder2lmdb(dpath, name="train", write_frequency=5000, num_workers=0): directory = osp.expanduser(osp.join(dpath, name)) print("Loading dataset from %s" % directory) dataset = ImageFolder(directory, loader=raw_reader)