diff --git a/README.md b/README.md index 36adb5b..c1f6b73 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ LAMA exposes a transparent and unique interface to use: - BERT (Devlin et al., 2018) - ELMo (Peters et al., 2018) - GPT (Radford et al., 2018) +- GPT-2 (Radford et al., 2019) - RoBERTa (Liu et al., 2019) Actually, LAMA is also a beautiful animal. @@ -185,13 +186,19 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and * __--bert-vocab-name/--bvn__ : name of vocabulary used to pre-train the BERT model (default = 'vocab.txt') -### RoBERTa +### RoBERTa (Fairseq) * __--roberta-model-dir/--rmd__ : directory that contains the RoBERTa pre-trained model and the vocabulary (__REQUIRED__) * __--roberta-model-name/--rmn__ : name of the RoBERTa pre-trained model (default = 'model.pt') * __--roberta-vocab-name/--rvn__ : name of vocabulary used to pre-train the RoBERTa model (default = 'dict.txt') +### RoBERTa (HuggingFace) + +* __--hfroberta-model-dir/--hmd__ : directory that contains the HuggingFace RoBERTa pre-trained model and the vocabulary (__REQUIRED__) +* __--hfroberta-model-name/--hmn__ : name of the HuggingFace RoBERTa pre-trained model (default = 'roberta-base') + + ### ELMo * __--elmo-model-dir/--emd__ : directory that contains the ELMo pre-trained model and the vocabulary (__REQUIRED__) @@ -211,6 +218,12 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and * __--gpt-model-name/--gmn__ : name of the gpt pre-trained model (default = 'openai-gpt') +### GPT-2 + +* __--gpt2-model-dir/--g2d__ : directory that contains the gpt2 pre-trained model and the vocabulary (__REQUIRED__) +* __--gpt2-model-name/--g2n__ : name of the gpt2 pre-trained model (default = 'gpt2') + + ## Evaluate Language Model(s) Generation options: diff --git a/download_models.sh b/download_models.sh index 886092b..6694b62 100755 --- a/download_models.sh +++ b/download_models.sh @@ -29,6 +29,18 @@ if [[ ! 
-f gpt/openai-gpt/config.json ]]; then cd ../.. fi +echo "GPT2" +if [[ ! -f gpt/gpt2/config.json ]]; then + rm -rf 'gpt/gpt2' + mkdir -p 'gpt/gpt2' + cd 'gpt/gpt2' + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json' -O vocab.json + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt' -O merges.txt + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin' -O 'pytorch_model.bin' + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json' -O 'config.json' + cd ../.. +fi + echo "BERT BASE LOWERCASED" if [[ ! -f bert/uncased_L-12_H-768_A-12/bert_config.json ]]; then mkdir -p 'bert' @@ -131,6 +143,29 @@ if [[ ! -f bert/cased_L-24_H-1024_A-16/bert_config.json ]]; then cd ../../ fi +echo "RoBERTa" +if [[ ! -f roberta/roberta.base/dict.txt ]]; then + rm -rf 'roberta/roberta.base' + mkdir -p 'roberta/roberta.base' + cd 'roberta' + wget -c 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz' + tar -xzf roberta.base.tar.gz + rm roberta.base.tar.gz + cd .. +fi + +echo "HuggingFace RoBERTa" +if [[ ! -f roberta/roberta-base/config.json ]]; then + rm -rf 'roberta/roberta-base' + mkdir -p 'roberta/roberta-base' + cd 'roberta/roberta-base' + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json' -O vocab.json + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt' -O merges.txt + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin' -O 'pytorch_model.bin' + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json' -O 'config.json' + cd ../.. +fi + cd "$ROOD_DIR" echo 'Building common vocab' @@ -138,6 +173,6 @@ if [ ! -f "$DST_DIR/common_vocab_cased.txt" ]; then python lama/vocab_intersection.py else echo 'Already exists. 
Run to re-build:' - echo 'python util_KB_completion.py' + echo 'python lama/vocab_intersection.py' fi diff --git a/lama/modules/__init__.py b/lama/modules/__init__.py index 73c9fcf..d1f4620 100644 --- a/lama/modules/__init__.py +++ b/lama/modules/__init__.py @@ -9,6 +9,8 @@ from .gpt_connector import GPT from .transformerxl_connector import TransformerXL from .roberta_connector import Roberta +from .hfroberta_connector import HfRoberta +from .gpt2_connector import GPT2 def build_model_by_name(lm, args, verbose=True): @@ -22,7 +24,9 @@ def build_model_by_name(lm, args, verbose=True): bert=Bert, gpt=GPT, transformerxl=TransformerXL, - roberta=Roberta + roberta=Roberta, + hfroberta=HfRoberta, + gpt2=GPT2 ) if lm not in MODEL_NAME_TO_CLASS: raise ValueError("Unrecognized Language Model: %s." % lm) diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index e32bf40..3127160 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -12,15 +12,24 @@ BERT_CLS = "[CLS]" BERT_SEP = "[SEP]" BERT_PAD = "[PAD]" + ELMO_UNK = "" ELMO_START_SENTENCE = "" ELMO_END_SENTENCE = "" + OPENAI_UNK = "" OPENAI_EOS = "" -ROBERTA_MASK = "" -ROBERTA_START_SENTENCE = "" -ROBERTA_END_SENTENCE = "" -ROBERTA_VOCAB_SIZE = 50266 + +ROBERTA_MASK = "" # MASK for fairseq/huggingface RoBERTa +ROBERTA_VOCAB_SIZE = 50266 # for fairseq RoBERTa + +ROBERTA_START_SENTENCE = "" # BOS, CLS for huggingface RoBERTa +ROBERTA_END_SENTENCE = "" # EOS, SEP for huggingface RoBERTa +ROBERTA_UNK = "" # UNK for huggingface RoBERTa +ROBERTA_PAD = "" # PAD for huggingface RoBERTa + +GPT2_EOS = "<|endoftext|>" # BOS, EOS, UNK, PAD for GPT2 + SPECIAL_SYMBOLS = [ MASK, @@ -32,7 +41,13 @@ ELMO_START_SENTENCE, ELMO_END_SENTENCE, OPENAI_UNK, - OPENAI_EOS + OPENAI_EOS, + ROBERTA_MASK, + # ROBERTA_UNK, + ROBERTA_PAD, + ROBERTA_START_SENTENCE, + ROBERTA_END_SENTENCE, + GPT2_EOS ] SPACE_NORMALIZER = re.compile(r"\s+") diff --git a/lama/modules/gpt2_connector.py 
b/lama/modules/gpt2_connector.py new file mode 100644 index 0000000..8e5b7b2 --- /dev/null +++ b/lama/modules/gpt2_connector.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer +import numpy as np +from lama.modules.base_connector import * + + +class GPT2(Base_Connector): + + def __init__(self, args): + super().__init__() + + if args.gpt2_model_dir is not None: + # load GPT2 model from file + gpt_model_name = str(args.gpt2_model_dir) + "/" + dict_file = gpt_model_name + print("loading GPT2 model from {}".format(gpt_model_name)) + else: + # load GPT2 model from huggingface cache + gpt_model_name = args.gpt2_model_name + dict_file = gpt_model_name + + # Load pre-trained model tokenizer (vocabulary) + self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file) + + # GPT uses a different way to represent BPE than BERT. Namely, the + # final suffixes are indicated with suffix, while pieces that must + # be followed are written as is. In BERT the prefixes are written as is + # while the parts that must follow (not be followed!) have '##' prefix. + # There is no one-to-one conversion. But at least we may make pieces that + # may form a full word look the same. + # Note that we should be very careful now, + # tokenizer.convert_tokens_to_ids won't work with our vocabulary. + + def convert_word(word): + if word == GPT2_EOS: + return word + + if word.startswith('Ġ'): # the token starts with a whitespace + return word[1:] + + return f'_{word}_' # the token not start with a white space. + # may be not a head of a word, + # or may be a head of a sentence. 
+ + _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) + self.vocab = [convert_word(word) for word in gpt_vocab] + self._init_inverse_vocab() + + # Load pre-trained model (weights) + self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name) + self.gpt_model.eval() + # print(self.gpt_model.config) + + # Sanity check. + assert len(self.vocab) == self.gpt_model.config.vocab_size + #assert 0 == self.gpt_model.config.n_special + + self.eos_id = self.gpt_model.config.eos_token_id + self.pad_id = self.gpt_model.config.eos_token_id + self.unk_id = self.gpt_model.config.eos_token_id + self.bos_id = self.gpt_model.config.bos_token_id + self.model_vocab = self.vocab + + def _cuda(self): + self.gpt_model.cuda() + + def get_id(self, string): + indexed_string = self.tokenizer.encode(f'a {string}')[1:] + return indexed_string + + def __get_input_tensors(self, sentence_list): + """Concatenates, tokenize and converts a sentences to model inputs. + + Args: + sentence_list: A list of strings. The string may contain a special + [MASK] token. + + Returns: + A tuple (src_tensor, dst_tensor, masked_indices, tokenized_text). + src_tensor: torch.LongTensor with shape (seq_len), the input to + the new without the last symbol and with EOS prepended. + dst_tensor: torch.LongTensor with shape (seq_len). + masked_indices: A list of indices of [MASK] in dst_tensor. + tokenized_text: A list of token string. + """ + # Split the sentence by [MASK] and tokenize the chunks independently. 
+ tokenized_text = [] + masked_indices = [] + for sentence_idx, sentence in enumerate(sentence_list): + if sentence_idx > 0: + tokenized_text.append(self.eos_id) + for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): + if chunk_idx > 0: + masked_indices.append(len(tokenized_text)) + tokenized_text.append(self.unk_id) # use UNK as [MASK] + chunk = chunk.strip() + if chunk: + tokenized_sentence = self.tokenizer.encode(chunk) + tokenized_text.extend(tokenized_sentence) + + full_indexed_tokens = [ + self.bos_id + ] + tokenized_text + full_tokens_tensor = torch.tensor(full_indexed_tokens) + src_tensor = full_tokens_tensor[:-1] + dst_tensor = full_tokens_tensor[1:] + + tokenized_text = self.tokenizer.decode(tokenized_text) + + return src_tensor, dst_tensor, masked_indices, tokenized_text + + def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): + if try_cuda: + self.try_cuda() + src_tensor_list, dst_tensor_list, masked_indices_list, _ = zip(*[ + self.__get_input_tensors(sentences) for sentences in sentences_list + ]) + + src_tensor_batch = torch.nn.utils.rnn.pad_sequence( + src_tensor_list, batch_first=True) + + # The model uses shared embedding space for tokens and positions. More + # precisely, the first len(vocab) indices are reserved for words, the + # last n_special symbols are reserved for special symbols and the rest + # is used for positions. Softmax and embedding matrices are shared and + # as a result some of output "symbols" correspond to positions. To fix + # that we have to manually remove logits for positions. 
+ with torch.no_grad(): + logits = self.gpt_model(src_tensor_batch.to(self._model_device))[0] + logits = logits[..., :self.gpt_model.config.vocab_size] + + log_probs = torch.nn.functional.log_softmax(logits, dim=-1).cpu() + + token_ids_list = [ + np.array(dst_tensor.numpy()) for dst_tensor in dst_tensor_list + ] + + return log_probs, token_ids_list, masked_indices_list + + def get_contextual_embeddings(self, sentences_list, try_cuda=True): + + if try_cuda: + self.try_cuda() + + src_tensor_list, dst_tensor_list, masked_indices_list, _ = zip(*[ + self.__get_input_tensors(sentences) for sentences in sentences_list + ]) + + src_tensor_batch = torch.nn.utils.rnn.pad_sequence( + src_tensor_list, batch_first=True) + + with torch.no_grad(): + output = self.gpt_model.transformer(src_tensor_batch.to(self._model_device)) + + # TODO + sentence_lengths = None + tokenized_text_list = None + + # As we only return the last layer, [] to have the same format as other models + return [output], sentence_lengths, tokenized_text_list + + diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py new file mode 100644 index 0000000..4cf7c94 --- /dev/null +++ b/lama/modules/hfroberta_connector.py @@ -0,0 +1,257 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+# +from pytorch_transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM + +import torch +import numpy as np +from lama.modules.base_connector import * + +import torch.nn.functional as F + +class HfRoberta(Base_Connector): + + def __init__(self, args): + super().__init__() + + if args.hfroberta_model_dir is not None: + # load RoBERTa model from file + roberta_model_name = str(args.hfroberta_model_dir) + "/" + dict_file = roberta_model_name + print("loading huggingface RoBERTa model from {}".format(roberta_model_name)) + else: + # load RoBERTa model from huggingface cache + roberta_model_name = args.hfroberta_model_name + dict_file = roberta_model_name + + # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer + do_lower_case = False + if 'uncased' in roberta_model_name: + do_lower_case=True + + # Load pre-trained model tokenizer (vocabulary) + self.tokenizer = RobertaTokenizer.from_pretrained(dict_file) + + # original vocab + + # The following process is based on gpt_connector. + + # RoBERTa also uses BPE. the bytes_to_unicode function takes all control + # and whitespace characters in code points 0-255 and shifts them up + # by 256 to make them printable. So space (code point 32) becomes Ġ (code point 288). + # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159). + # + # Other control characters will be removed during vocab_intersection process. + def convert_word(word): + if word == ROBERTA_UNK: + return word + if word == ROBERTA_MASK: + return word + if word == ROBERTA_START_SENTENCE: + return word + if word == ROBERTA_END_SENTENCE: + return word + if word == ROBERTA_PAD: + return word + + if word.startswith('Ġ'): # the token starts with a whitespace + return word[1:] + + return f'_{word}_' # the token not start with a white space. + # may be not a head of a word, + # or may be a head of a sentence. + + # need duplication check? 
+ + _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) + self.vocab = [convert_word(word) for word in gpt_vocab] + self._init_inverse_vocab() + + # Get UNK symbol as it's written in the origin RoBERTa vocab. + unk_index = self.inverse_vocab[ROBERTA_UNK] # OPENAI_UNK + self.unk_symbol = self.tokenizer.decoder[unk_index] + + # Get MASK symbol as it's written in the origin RoBERTa vocab. + mask_index = self.inverse_vocab[ROBERTA_MASK] + self.mask_symbol = self.tokenizer.decoder[mask_index] + + # Load pre-trained model (weights) + self.masked_roberta_model = RobertaForMaskedLM.from_pretrained(roberta_model_name) + self.masked_roberta_model.eval() + #print(self.masked_roberta_model.config) + + # ... to get hidden states + self.roberta_model = self.masked_roberta_model.roberta + + # Sanity check. + #assert len(self.vocab) == self.masked_roberta_model.config.vocab_size + #assert 0 == self.masked_roberta_model.config.n_special + + self.eos_id = self.inverse_vocab[ROBERTA_END_SENTENCE] # OPENAI_EOS + self.model_vocab = self.vocab + + self.pad_id = self.inverse_vocab[ROBERTA_PAD] + self.unk_index = self.inverse_vocab[ROBERTA_UNK] + self.mask_index = mask_index + + def __get_token_ids_from_tensor(self, indexed_string): + token_ids = indexed_string + return token_ids + + def _cuda(self): + self.masked_roberta_model.cuda() + + def get_id(self, string): + # tokenize "a " + string, in order to create token_id(s) corresponding to the string. + # the first token of the string starts with a whitespace. 
+ tokenized_text = self.tokenizer.tokenize(f'a {string}') + tokenized_text = tokenized_text[1:] + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + return indexed_string + + def __get_input_tensors_batch(self, sentences_list): + tokens_tensors_list = [] + segments_tensors_list = [] + masked_indices_list = [] + tokenized_text_list = [] + max_tokens = 0 + for sentences in sentences_list: + tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences) + tokens_tensors_list.append(tokens_tensor) + segments_tensors_list.append(segments_tensor) + masked_indices_list.append(masked_indices) + tokenized_text_list.append(tokenized_text) + # assert(tokens_tensor.shape[1] == segments_tensor.shape[1]) + if (tokens_tensor.shape[1] > max_tokens): + max_tokens = tokens_tensor.shape[1] + # print("MAX_TOKENS: {}".format(max_tokens)) + # apply padding and concatenate tensors + # use [PAD] for tokens and 0 for segments + final_tokens_tensor = None + final_segments_tensor = None + final_attention_mask = None + for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list): + dim_tensor = tokens_tensor.shape[1] + pad_lenght = max_tokens - dim_tensor + attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long) + if pad_lenght>0: + pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long) + pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long) + attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long) + tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1) + segments_tensor = torch.cat((segments_tensor,pad_2), dim=1) + attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1) + if final_tokens_tensor is None: + final_tokens_tensor = tokens_tensor + final_segments_tensor = segments_tensor + final_attention_mask = attention_tensor + else: + final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0) + final_segments_tensor = 
torch.cat((final_segments_tensor,segments_tensor), dim=0) + final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0) + # print(final_tokens_tensor) + # print(final_segments_tensor) + # print(final_attention_mask) + # print(final_tokens_tensor.shape) + # print(final_segments_tensor.shape) + # print(final_attention_mask.shape) + return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list + + def __get_input_tensors(self, sentences): + tokenized_text = [] + masked_indices = [] + segment_indices = [] + + # add [CLS] or [BOS] token at the beginning + tokenized_text.append(ROBERTA_START_SENTENCE) + segment_indices.append(0) + + for sentence_idx, sentence in enumerate(sentences): + for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): + if chunk_idx > 0: + masked_indices.append(len(tokenized_text)) + segment_indices.append(sentence_idx) + tokenized_text.append(self.mask_symbol) + + chunk = chunk.strip() + if chunk: + tokenized_sentence = self.tokenizer.tokenize(chunk) + segment_id = np.full(len(tokenized_sentence), + sentence_idx, + dtype=int).tolist() + + tokenized_text.extend(tokenized_sentence) + segment_indices.extend(segment_id) + + # add [EOS] or [SEP] token at the end of sequence or sentence + tokenized_text.append(ROBERTA_END_SENTENCE) + segment_indices.append(sentence_idx) + + # look for masked indices + masked_indices = [] + for i in range(len(tokenized_text)): + token = tokenized_text[i] + if token == ROBERTA_MASK: # MASK + masked_indices.append(i) + + indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) + + # Convert inputs to PyTorch tensors + tokens_tensor = torch.tensor([indexed_tokens]) + segments_tensors = torch.tensor([segment_indices]) + + return tokens_tensor, segments_tensors, masked_indices, tokenized_text + + + def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): + if not sentences_list: + return None + if try_cuda: + 
self.try_cuda() + #print(sentences_list) + tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) + + if logger is not None: + logger.debug("\n{}\n".format(tokenized_text_list)) + + with torch.no_grad(): + logits = self.masked_roberta_model( + input_ids=tokens_tensor.to(self._model_device), + token_type_ids=segments_tensor.to(self._model_device), + attention_mask=attention_mask_tensor.to(self._model_device), + )[0] + + log_probs = F.log_softmax(logits, dim=-1).cpu() + + token_ids_list = [] + for indexed_string in tokens_tensor.numpy(): + token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string)) + + return log_probs, token_ids_list, masked_indices_list + + def get_contextual_embeddings(self, sentences_list, try_cuda=True): + + # assume in input 1 or 2 sentences - in general, it considers only the first 2 sentences + if not sentences_list: + return None + if try_cuda: + self.try_cuda() + + tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) + + with torch.no_grad(): + all_encoder_layers, _ = self.roberta_model( + tokens_tensor.to(self._model_device), + segments_tensor.to(self._model_device)) + + all_encoder_layers = [layer.cpu() for layer in all_encoder_layers] + + sentence_lengths = [len(x) for x in tokenized_text_list] + + # all_encoder_layers: a list of the full sequences of encoded-hidden-states at the end + # of each attention block (i.e. 
12 full sequences for BERT-base, 24 for BERT-large), each + # encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + return all_encoder_layers, sentence_lengths, tokenized_text_list diff --git a/lama/options.py b/lama/options.py index a767ef7..e1c754f 100644 --- a/lama/options.py +++ b/lama/options.py @@ -48,6 +48,8 @@ def get_general_parser(): __add_gpt_args(parser) __add_transformerxl_args(parser) __add_roberta_args(parser) + __add_hfroberta_args(parser) + __add_gpt2_args(parser) return parser @@ -156,6 +158,24 @@ def __add_roberta_args(parser): return group +def __add_hfroberta_args(parser): + group = parser.add_argument_group("HuggingFace RoBERTa") + group.add_argument( + "--hfroberta-model-dir", + "--hmd", + dest="hfroberta_model_dir", + help="directory that contains the HuggingFace ROBERTA pre-trained model and the vocabulary", + ) + group.add_argument( + "--hfroberta-model-name", + "--hmn", + dest="hfroberta_model_name", + default="roberta-base", + help="name of the HuggingFace ROBERTA pre-trained model (default = 'model.pt')", + ) + return group + + def __add_gpt_args(parser): group = parser.add_argument_group("GPT") group.add_argument( @@ -174,6 +194,24 @@ def __add_gpt_args(parser): return group +def __add_gpt2_args(parser): + group = parser.add_argument_group("GPT2") + group.add_argument( + "--gpt2-model-dir", + "--g2d", + dest="gpt2_model_dir", + help="directory that contains the gpt2 pre-trained model and the vocabulary", + ) + group.add_argument( + "--gpt2-model-name", + "--g2n", + dest="gpt2_model_name", + default="gpt2", + help="name of the gpt2 pre-trained model (default = 'gpt2')", + ) + return group + + def __add_transformerxl_args(parser): group = parser.add_argument_group("GPT") group.add_argument( diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 235024c..a838814 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -21,11 +21,11 @@ # "cpu": True, # 
"output_dictionary_size": -1 # }, - { - # "TransformerXL" - "lm": "transformerxl", - "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", - }, + # { + # # "TransformerXL" + # "lm": "transformerxl", + # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", + # }, { # "ELMO ORIGINAL" "lm": "elmo", @@ -55,32 +55,55 @@ "bert_model_name": "bert-large-cased", "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16/", "bert_vocab_name": "vocab.txt" - } + }, + { + # "RoBERTa base" + "lm" : "roberta", + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + "max_sentence_length": 100 + }, + { + # "hfRoBERTa base" + "lm" : "hfroberta", + "hfroberta_model_name": "roberta-base", + "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", + }, + { + # "OpenAI GPT-2" + "lm": "gpt2", + "gpt2_model_name": "gpt2", + "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", + }, ] CASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_cased.txt" LOWERCASED_MODELS = [ - { - # "BERT BASE UNCASED" - "lm": "bert", - "bert_model_name": "bert-base-uncased", - "bert_model_dir": None, - "bert_vocab_name": "vocab.txt" - }, - { - # "BERT LARGE UNCASED" - "lm": "bert", - "bert_model_name": "bert-large-uncased", - "bert_model_dir": None, - "bert_vocab_name": "vocab.txt" - }, - { - # "OpenAI GPT" - "lm": "gpt", - "gpt_model_dir": None, - "gpt_model_name": "openai-gpt" - } + { + # "BERT BASE UNCASED" + "lm": "bert", + "bert_model_name": "bert-base-uncased", + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12", + "bert_vocab_name": "vocab.txt" + }, + { + # "BERT LARGE UNCASED" + "lm": "bert", + "bert_model_name": "bert-large-uncased", + #"bert_model_dir": None, + "bert_model_dir": 
"pre-trained_language_models/bert/uncased_L-24_H-1024_A-16", + "bert_vocab_name": "vocab.txt" + }, + { + # "OpenAI GPT" + "lm": "gpt", + #"gpt_model_dir": None, + "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt", + "gpt_model_name": "openai-gpt" + }, ] LOWERCASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_lowercased.txt" diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 1b8fec4..708c11d 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -201,12 +201,22 @@ def lowercase_samples(samples, use_negated_probes=False): new_samples = [] for sample in samples: sample["obj_label"] = sample["obj_label"].lower() - sample["sub_label"] = sample["sub_label"].lower() + try: + sample["sub_label"] = sample["sub_label"].lower() + except KeyError: # ConceptNet + None lower_masked_sentences = [] - for sentence in sample["masked_sentences"]: - sentence = sentence.lower() - sentence = sentence.replace(base.MASK.lower(), base.MASK) - lower_masked_sentences.append(sentence) + try: + for sentence in sample["masked_sentences"]: + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) + except KeyError: + for evidence in sample['evidences']: # TREx + sentence = evidence['masked_sentence'] + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) sample["masked_sentences"] = lower_masked_sentences if "negated" in sample and use_negated_probes: @@ -318,6 +328,10 @@ def main(args, shuffle_data=True, model=None): model_name = "BERT_{}".format(args.bert_model_name) elif model_type_name == "elmo": model_name = "ELMo_{}".format(args.elmo_model_name) + elif model_type_name == "roberta": + model_name = "RoBERTa_{}".format(args.roberta_model_name) + elif model_type_name == "hfroberta": + model_name = 
"hfRoBERTa_{}".format(args.hfroberta_model_name) else: model_name = model_type_name.title() @@ -385,6 +399,14 @@ def main(args, shuffle_data=True, model=None): else: # keep samples as they are all_samples = data + # TREx data + for i, sample in enumerate(all_samples): + if 'masked_sentences' not in sample.keys(): + sample['masked_sentences'] = [] + for evidence in sample['evidences']: + sample['masked_sentences'].append(evidence['masked_sentence']) + if i == 0: + print('not masked_sentences, but masked_sentence.') all_samples, ret_msg = filter_samples( model, data, vocab_subset, args.max_sentence_length, args.template @@ -646,12 +668,15 @@ def main(args, shuffle_data=True, model=None): pool.join() # stats - # Mean reciprocal rank - MRR /= len(list_of_results) - - # Precision - Precision /= len(list_of_results) - Precision1 /= len(list_of_results) + try: + # Mean reciprocal rank + MRR /= len(list_of_results) + + # Precision + Precision /= len(list_of_results) + Precision1 /= len(list_of_results) + except ZeroDivisionError: + MRR = Precision = Precision1 = 0.0 msg = "all_samples: {}\n".format(len(all_samples)) msg += "list_of_results: {}\n".format(len(list_of_results)) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 024e681..b06c82f 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -17,13 +17,13 @@ from collections import defaultdict LMs = [ - { - "lm": "transformerxl", - "label": "transformerxl", - "models_names": ["transformerxl"], - "transformerxl_model_name": "transfo-xl-wt103", - "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", - }, + #{ + # "lm": "transformerxl", + # "label": "transformerxl", + # "models_names": ["transformerxl"], + # "transformerxl_model_name": "transfo-xl-wt103", + # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", + #}, { "lm": "elmo", "label": "elmo", @@ -56,9 +56,39 @@ "bert_model_name": "bert-large-cased", 
"bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", }, + { + "lm": "gpt", + "label": "gpt", + "models_names": ["gpt"], + "gpt_model_name": "openai-gpt", + "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", + "lowercase": True, + "common_vocab_filename": "pre-trained_language_models/common_vocab_lowercased.txt", + }, + { + "lm": "roberta", + "label": "roberta.base", + "models_names": ["roberta"], + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + }, + { + "lm": "hfroberta", + "label": "roberta-base", + "models_names": ["hfroberta"], + "hfroberta_model_name": "roberta-base", + "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", + }, + { + "lm": "gpt2", + "label": "gpt2", + "models_names": ["gpt2"], + "gpt2_model_name": "gpt2", + "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", + }, ] - def run_experiments( relations, data_path_pre, @@ -79,7 +109,11 @@ def run_experiments( type_Precision1 = defaultdict(list) type_count = defaultdict(list) - results_file = open("last_results.csv", "w+") + results_file = open("last_results.csv", "a+") + results_file.write( + "{},{}\n".format("lm_label", input_param["label"]) + ) + results_file.flush() for relation in relations: pp.pprint(relation)