diff --git a/nasbench/controller.py b/nasbench/controller.py
index 74e75d4..f93527c 100644
--- a/nasbench/controller.py
+++ b/nasbench/controller.py
@@ -51,15 +51,15 @@ def flatten_parameters(self):
         self.encoder.rnn.flatten_parameters()
         self.decoder.rnn.flatten_parameters()
 
-    def forward(self, input_variable, target_variable=None):
-        encoder_outputs, encoder_hidden, arch_emb, predict_value = self.encoder(input_variable)
+    def forward(self, input_variable, input_len, target_variable=None):
+        encoder_outputs, encoder_hidden, arch_emb, predict_value = self.encoder(input_variable, input_len)
         decoder_hidden = (arch_emb.unsqueeze(0), arch_emb.unsqueeze(0))
-        decoder_outputs, archs = self.decoder(target_variable, decoder_hidden, encoder_outputs)
+        decoder_outputs, archs = self.decoder(target_variable, input_len, decoder_hidden, encoder_outputs)
         return predict_value, decoder_outputs, archs
 
-    def generate_new_arch(self, input_variable, predict_lambda=1, direction='-'):
+    def generate_new_arch(self, input_variable, input_len, predict_lambda=1, direction='-'):
         encoder_outputs, encoder_hidden, arch_emb, predict_value, new_encoder_outputs, new_arch_emb, new_predict_value = self.encoder.infer(
-            input_variable, predict_lambda, direction=direction)
+            input_variable, input_len, predict_lambda, direction=direction)
         new_encoder_hidden = (new_arch_emb.unsqueeze(0), new_arch_emb.unsqueeze(0))
-        decoder_outputs, new_archs = self.decoder(None, new_encoder_hidden, new_encoder_outputs)
+        decoder_outputs, new_archs = self.decoder(None, input_len, new_encoder_hidden, new_encoder_outputs)
         return new_archs, new_predict_value
diff --git a/nasbench/decoder.py b/nasbench/decoder.py
index e2a7e92..79f379d 100644
--- a/nasbench/decoder.py
+++ b/nasbench/decoder.py
@@ -4,6 +4,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import utils
+from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
 
 SOS_ID = 0
 EOS_ID = 0
@@ -73,7 +75,9 @@ def __init__(self,
         for i in range(self.n):
             self.offsets.append( (i + 3) * i // 2 - 1)
 
-    def forward(self, x, encoder_hidden=None, encoder_outputs=None):
+    def forward(self, x, x_len, encoder_hidden=None, encoder_outputs=None):
+        # x is decoder_inputs = [0] + encoder_inputs[:-1]
+
         decoder_hidden = self._init_state(encoder_hidden)
         if x is not None:
             bsz = x.size(0)
@@ -81,17 +85,30 @@ def forward(self, x, encoder_hidden=None, encoder_outputs=None):
             x = self.embedding(x)
             x = F.dropout(x, self.dropout, training=self.training)
             residual = x
+
+            x = pack_padded_sequence(x, x_len, batch_first=True)
             x, hidden = self.rnn(x, decoder_hidden)
+            x = pad_packed_sequence(x, batch_first=True)[0]
+
             x = (residual + x) * math.sqrt(0.5)
             residual = x
-            x, _ = self.attention(x, encoder_outputs)
+
+            # create mask
+            mask = torch.zeros(bsz, x.size(1))
+            for i,l in enumerate(x_len):
+                for j in range(l):
+                    mask[i][j] = 1
+            mask = (mask == 0).unsqueeze(1)
+            mask = utils.move_to_cuda(mask)
+
+            x, _ = self.attention(x, encoder_outputs, mask=mask)
             x = (residual + x) * math.sqrt(0.5)
             predicted_softmax = F.log_softmax(self.out(x.view(-1, self.hidden_size)), dim=-1)
             predicted_softmax = predicted_softmax.view(bsz, tgt_len, -1)
             return predicted_softmax, None
 
-        # inference
+        # inference: x_len is not used here, so no pack/pad is needed
 
         assert x is None
         bsz = encoder_hidden[0].size(1)
         length = self.length
diff --git a/nasbench/encoder.py b/nasbench/encoder.py
index 2f19253..350bc9b 100644
--- a/nasbench/encoder.py
+++ b/nasbench/encoder.py
@@ -4,9 +4,11 @@
 import logging
 import math
 
+import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
 
 
 class Encoder(nn.Module):
@@ -53,11 +55,15 @@ def forward_predictor(self, x):
         predict_value = torch.sigmoid(x)
         return predict_value
 
-    def forward(self, x):
+    def forward(self, x, x_len):
         x = self.embedding(x)
         x = F.dropout(x, self.dropout, training=self.training)
         residual = x
+
+        x = pack_padded_sequence(x, x_len, batch_first=True)
         x, hidden = self.rnn(x)
+        x = pad_packed_sequence(x, batch_first=True)[0]
+
         x = self.out_proj(x)
         x = residual + x
         x = F.normalize(x, 2, dim=-1)
@@ -79,8 +85,8 @@ def forward(self, x):
         predict_value = torch.sigmoid(x)
         return encoder_outputs, encoder_hidden, arch_emb, predict_value
 
-    def infer(self, x, predict_lambda, direction='-'):
-        encoder_outputs, encoder_hidden, arch_emb, predict_value = self(x)
+    def infer(self, x, x_len, predict_lambda, direction='-'):
+        encoder_outputs, encoder_hidden, arch_emb, predict_value = self(x, x_len)
         grads_on_outputs = torch.autograd.grad(predict_value, encoder_outputs, torch.ones_like(predict_value))[0]
         if direction == '+':
             new_encoder_outputs = encoder_outputs + predict_lambda * grads_on_outputs
diff --git a/nasbench/runs/train_seminas.sh b/nasbench/runs/train_seminas.sh
index 4ccb8cf..e21f273 100644
--- a/nasbench/runs/train_seminas.sh
+++ b/nasbench/runs/train_seminas.sh
@@ -2,9 +2,7 @@
 cd ..
 export PYTHONPATH=.:$PYTHONPATH
 MODEL=seminas
 OUTPUT_DIR=outputs/$MODEL
-
+DATASET_DIR=/home/dzzp/workspace/dataset/
 mkdir -p $OUTPUT_DIR
-python train_seminas.py \
-  --output_dir=$OUTPUT_DIR \
-  | tee $OUTPUT_DIR/log.txt
+CUDA_VISIBLE_DEVICES=1 python3 train_seminas.py --data=$DATASET_DIR --output_dir=$OUTPUT_DIR | tee $OUTPUT_DIR/log.txt
diff --git a/nasbench/train_seminas.py b/nasbench/train_seminas.py
index 6fc6dbb..b4e4864 100644
--- a/nasbench/train_seminas.py
+++ b/nasbench/train_seminas.py
@@ -61,13 +61,28 @@ def controller_train(train_queue, model, optimizer):
     nll = utils.AvgrageMeter()
     model.train()
     for step, sample in enumerate(train_queue):
-        encoder_input = utils.move_to_cuda(sample['encoder_input'])
-        encoder_target = utils.move_to_cuda(sample['encoder_target'])
-        decoder_input = utils.move_to_cuda(sample['decoder_input'])
-        decoder_target = utils.move_to_cuda(sample['decoder_target'])
+        encoder_input_unsorted = sample['encoder_input'].long()  # shape: (batch_size, max_seq_len)
+        encoder_target_unsorted = sample['encoder_target'].float()
+        decoder_input_unsorted = sample['decoder_input'].long()
+        decoder_target_unsorted = sample['decoder_target'].long()
+        input_len_unsorted = sample['input_len']
+
+        # sort input batch
+        input_len, sort_index = torch.sort(input_len_unsorted, 0, descending=True)
+        input_len = input_len.numpy().tolist()
+        encoder_input = torch.index_select(encoder_input_unsorted, 0, sort_index)
+        encoder_target = torch.index_select(encoder_target_unsorted, 0, sort_index)
+        decoder_input = torch.index_select(decoder_input_unsorted, 0, sort_index)
+        decoder_target = torch.index_select(decoder_target_unsorted, 0, sort_index)
+
+        # move to cuda
+        encoder_input = utils.move_to_cuda(encoder_input)
+        encoder_target = utils.move_to_cuda(encoder_target)
+        decoder_input = utils.move_to_cuda(decoder_input)
+        decoder_target = utils.move_to_cuda(decoder_target)
 
         optimizer.zero_grad()
-        predict_value, log_prob, arch = model(encoder_input, decoder_input)
+        predict_value, log_prob, arch = model(encoder_input, input_len, decoder_input)
         loss_1 = F.mse_loss(predict_value.squeeze(), encoder_target.squeeze())
         loss_2 = F.nll_loss(log_prob.contiguous().view(-1, log_prob.size(-1)), decoder_target.view(-1))
         loss = args.trade_off * loss_1 + (1 - args.trade_off) * loss_2
@@ -88,9 +103,17 @@ def controller_infer(queue, model, step, direction='+'):
     new_predict_values = []
     model.eval()
     for i, sample in enumerate(queue):
-        encoder_input = utils.move_to_cuda(sample['encoder_input'])
+        encoder_input_unsorted = sample['encoder_input'].long()  # shape: (batch_size, max_seq_len)
+        input_len_unsorted = sample['input_len']
+        # sort input batch
+        input_len, sort_index = torch.sort(input_len_unsorted, 0, descending=True)
+        input_len = input_len.numpy().tolist()
+        encoder_input = torch.index_select(encoder_input_unsorted, 0, sort_index)
+        # move to gpu
+        encoder_input = utils.move_to_cuda(encoder_input)
+
         model.zero_grad()
-        new_arch, new_predict_value = model.generate_new_arch(encoder_input, step, direction=direction)
+        new_arch, new_predict_value = model.generate_new_arch(encoder_input, input_len, step, direction=direction)
         new_arch_list.extend(new_arch.data.squeeze().tolist())
         new_predict_values.extend(new_predict_value.data.squeeze().tolist())
     return new_arch_list, new_predict_values
@@ -122,8 +145,16 @@ def generate_synthetic_controller_data(nasbench, model, base_arch=None, random_a
     with torch.no_grad():
         model.eval()
         for sample in controller_synthetic_queue:
-            encoder_input = sample['encoder_input'].cuda()
-            _, _, _, predict_value = model.encoder(encoder_input)
+            encoder_input_unsorted = sample['encoder_input'].long()  # shape: (batch_size, max_seq_len)
+            input_len_unsorted = sample['input_len']
+            # sort input batch
+            input_len, sort_index = torch.sort(input_len_unsorted, 0, descending=True)
+            input_len = input_len.numpy().tolist()
+            encoder_input = torch.index_select(encoder_input_unsorted, 0, sort_index)
+            # move to gpu
+            encoder_input = utils.move_to_cuda(encoder_input)
+
+            _, _, _, predict_value = model.encoder(encoder_input, input_len)
             random_synthetic_target += predict_value.data.squeeze().tolist()
     assert len(random_synthetic_input) == len(random_synthetic_target)
     synthetic_input = random_synthetic_input
@@ -148,7 +179,7 @@ def main():
 
     args.source_length = args.encoder_length = args.decoder_length = (args.nodes + 2) * (args.nodes - 1) // 2
 
-    nasbench = api.NASBench(os.path.join(args.data, 'nasbench_full.tfrecord'))
+    nasbench = api.NASBench(os.path.join(args.data, 'nasbench_only108.tfrecord'))
 
     controller = NAO(
         args.encoder_layers,
@@ -174,8 +205,8 @@ def main():
         logging.info('Iteration {}'.format(i+1))
         if not child_arch_pool_valid_acc:
             for arch in child_arch_pool:
-                data = nasbench.query(arch)
-                child_arch_pool_valid_acc.append(data['validation_accuracy'])
+                val_acc = nasbench.query(arch, option='valid')
+                child_arch_pool_valid_acc.append(val_acc)
 
         arch_pool += child_arch_pool
         arch_pool_valid_acc += child_arch_pool_valid_acc
@@ -200,8 +231,7 @@ def main():
             print('Architecture connection:{}'.format(arch_pool[arch_index].matrix))
             print('Architecture operations:{}'.format(arch_pool[arch_index].ops))
             print('Valid accuracy:{}'.format(arch_pool_valid_acc[arch_index]))
-            fs, cs = nasbench.get_metrics_from_spec(arch_pool[arch_index])
-            test_acc = np.mean([cs[108][j]['final_test_accuracy'] for j in range(3)])
+            test_acc = nasbench.query(arch_pool[arch_index], option='test')
             print('Mean test accuracy:{}'.format(test_acc))
             break
 
@@ -245,9 +275,9 @@ def main():
         logging.info('Generate new architectures with step size %d', predict_step_size)
         new_seq, new_perfs = controller_infer(controller_infer_queue, controller, predict_step_size, direction='+')
         for seq in new_seq:
-            matrix, ops = utils.convert_seq_to_arch(seq)
+            matrix, ops = utils.convert_seq_to_arch(seq, nasbench.search_space)
             arch = api.ModelSpec(matrix=matrix, ops=ops)
-            if nasbench.is_valid(arch) and len(arch.ops) == 7 and seq not in train_encoder_input and seq not in new_seqs:
+            if nasbench.is_valid(arch) and seq not in train_encoder_input and seq not in new_seqs:
                 new_archs.append(arch)
                 new_seqs.append(seq)
             if len(new_seqs) >= args.new_arch:
diff --git a/nasbench/utils.py b/nasbench/utils.py
index 3e9c2d8..171cbdf 100644
--- a/nasbench/utils.py
+++ b/nasbench/utils.py
@@ -3,6 +3,7 @@
 import torch
 import torch.utils.data
 import torch.nn.functional as F
+from torch.autograd import Variable
 from nasbench import api
 
 INPUT = 'input'
@@ -48,19 +49,19 @@ def generate_arch(n, nasbench, need_perf=False):
     np.random.shuffle(all_keys)
     for key in all_keys:
         fixed_stat, computed_stat = nasbench.get_metrics_from_hash(key)
-        if len(fixed_stat['module_operations']) < 7:
-            continue
+        #if len(fixed_stat['module_operations']) < 7:
+        #    continue
         arch = api.ModelSpec(
             matrix=fixed_stat['module_adjacency'],
             ops=fixed_stat['module_operations'],
         )
         if need_perf:
-            data = nasbench.query(arch)
-            if data['validation_accuracy'] < 0.9:
+            val_acc = nasbench.query(arch, option='valid')
+            if val_acc < 0.9:
                 continue
-            valid_accs.append(data['validation_accuracy'])
+            valid_accs.append(val_acc)
         archs.append(arch)
-        seqs.append(convert_arch_to_seq(arch.matrix, arch.ops))
+        seqs.append(convert_arch_to_seq(arch.matrix, arch.ops, nasbench.search_space))
         count += 1
         if count >= n:
             return archs, seqs, valid_accs
@@ -75,74 +76,76 @@ def __init__(self, inputs, targets=None, train=True, sos_id=0, eos_id=0):
         super(ControllerDataset, self).__init__()
         if targets is not None:
             assert len(inputs) == len(targets)
-        self.inputs = inputs
+        self.inputs = inputs  # list of seqs
+        self.len_inputs = [len(i) for i in inputs]
+        self.max_len = max(self.len_inputs)
         self.targets = targets
         self.train = train
         self.sos_id = sos_id
         self.eos_id = eos_id
 
     def __getitem__(self, index):
-        encoder_input = self.inputs[index]
+        encoder_input = self.inputs[index] + [0 for _ in range(self.max_len - len(self.inputs[index]))]  # pad with 0 up to max_len
+        len_input = self.len_inputs[index]
         encoder_target = None
         if self.targets is not None:
             encoder_target = [self.targets[index]]
         if self.train:
             decoder_input = [self.sos_id] + encoder_input[:-1]
             sample = {
-                'encoder_input': torch.LongTensor(encoder_input),
-                'encoder_target': torch.FloatTensor(encoder_target),
-                'decoder_input': torch.LongTensor(decoder_input),
-                'decoder_target': torch.LongTensor(encoder_input),
+                'encoder_input': np.array(encoder_input, dtype=np.int64),
+                'encoder_target': np.array(encoder_target, dtype=np.float64),
+                'decoder_input': np.array(decoder_input, dtype=np.int64),
+                'decoder_target': np.array(encoder_input, dtype=np.int64),
+                'input_len': len_input,
             }
         else:
             sample = {
-                'encoder_input': torch.LongTensor(encoder_input),
-                'decoder_target': torch.LongTensor(encoder_input),
+                'encoder_input': np.array(encoder_input, dtype=np.int64),
+                'decoder_target': np.array(encoder_input, dtype=np.int64),
+                'input_len': len_input,
             }
         if encoder_target is not None:
-            sample['encoder_target'] = torch.FloatTensor(encoder_target)
+            sample['encoder_target'] = np.array(encoder_target, dtype=np.float64)
         return sample
 
     def __len__(self):
        return len(self.inputs)
 
 
-def convert_arch_to_seq(matrix, ops):
+def convert_arch_to_seq(matrix, ops, search_space):
     seq = []
     n = len(matrix)
     assert n == len(ops)
+
     for col in range(1, n):
         for row in range(col):
             seq.append(matrix[row][col]+1)
-        if ops[col] == CONV1X1:
-            seq.append(3)
-        elif ops[col] == CONV3X3:
-            seq.append(4)
-        elif ops[col] == MAXPOOL3X3:
-            seq.append(5)
-        if ops[col] == OUTPUT:
-            seq.append(6)
+        if ops[col] == 'output':
+            seq.append(len(search_space) + 3)
+        elif ops[col] != 'input':
+            seq.append(search_space.index(ops[col]) + 3)
+
     assert len(seq) == (n+2)*(n-1)/2
     return seq
 
 
-def convert_seq_to_arch(seq):
+def convert_seq_to_arch(seq, search_space):
     n = int(math.floor(math.sqrt((len(seq) + 1) * 2)))
     matrix = [[0 for _ in range(n)] for _ in range(n)]
-    ops = [INPUT]
+    ops = ['input']
+
     for i in range(n-1):
         offset=(i+3)*i//2
         for j in range(i+1):
             matrix[j][i+1] = seq[offset+j] - 1
-        if seq[offset+i+1] == 3:
-            op = CONV1X1
-        elif seq[offset+i+1] == 4:
-            op = CONV3X3
-        elif seq[offset+i+1] == 5:
-            op = MAXPOOL3X3
-        elif seq[offset+i+1] == 6:
-            op = OUTPUT
+        idx = seq[offset+i+1] - 3
+        if idx == len(search_space):
+            op = 'output'
+        else:
+            op = search_space[idx]
         ops.append(op)
+
     return matrix, ops
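
Note for reviewers: every call site in this patch repeats the same sort-then-pack pattern because pack_padded_sequence assumes a batch sorted by descending length unless enforce_sorted=False is passed (available in newer PyTorch releases). The following sketch is illustrative only and not part of the patch; the toy tensors and module sizes are made up. It shows the pattern in isolation, along with a vectorized equivalent of the decoder's double-loop mask construction:

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Toy batch (made-up data): 3 token-id sequences padded to max_len = 5.
encoder_input = torch.tensor([[4, 3, 5, 0, 0],
                              [4, 3, 0, 0, 0],
                              [4, 3, 5, 6, 2]])
input_len = torch.tensor([3, 2, 5])

# pack_padded_sequence expects lengths in descending order, hence the
# sort/index_select dance repeated throughout the patch.
input_len, sort_index = torch.sort(input_len, 0, descending=True)
encoder_input = torch.index_select(encoder_input, 0, sort_index)

embedding = torch.nn.Embedding(10, 8)
rnn = torch.nn.LSTM(8, 8, batch_first=True)

x = embedding(encoder_input)
packed = pack_padded_sequence(x, input_len.tolist(), batch_first=True)
out, _ = rnn(packed)                          # padded steps are skipped
out = pad_packed_sequence(out, batch_first=True)[0]

# Vectorized equivalent of the decoder's double loop:
# True at padded positions, shape (bsz, 1, max_len).
mask = (torch.arange(out.size(1)).unsqueeze(0) >= input_len.unsqueeze(1)).unsqueeze(1)
print(out.shape, mask.shape)  # torch.Size([3, 5, 8]) torch.Size([3, 1, 5])

If the repository's minimum PyTorch version permits it, passing enforce_sorted=False to pack_padded_sequence would remove the need to sort (and later unsort) each batch by hand.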