diff --git a/README.md b/README.md
index 36adb5b..c1f6b73 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@ LAMA exposes a transparent and unique interface to use:
 - BERT (Devlin et al., 2018)
 - ELMo (Peters et al., 2018)
 - GPT (Radford et al., 2018)
+- GPT-2 (Radford et al., 2019)
 - RoBERTa (Liu et al., 2019)
 
 Actually, LAMA is also a beautiful animal.
@@ -185,13 +186,19 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and
 * __--bert-vocab-name/--bvn__ : name of vocabulary used to pre-train the BERT model (default = 'vocab.txt')
 
 
-### RoBERTa
+### RoBERTa (Fairseq)
 
 * __--roberta-model-dir/--rmd__ : directory that contains the RoBERTa pre-trained model and the vocabulary (__REQUIRED__)
 * __--roberta-model-name/--rmn__ : name of the RoBERTa pre-trained model (default = 'model.pt')
 * __--roberta-vocab-name/--rvn__ : name of vocabulary used to pre-train the RoBERTa model (default = 'dict.txt')
 
 
+### RoBERTa (HuggingFace)
+
+* __--hfroberta-model-dir/--hmd__ : directory that contains the HuggingFace RoBERTa pre-trained model and the vocabulary (__REQUIRED__)
+* __--hfroberta-model-name/--hmn__ : name of the HuggingFace RoBERTa pre-trained model (default = 'roberta-base')
+
+
 ### ELMo
 
 * __--elmo-model-dir/--emd__ : directory that contains the ELMo pre-trained model and the vocabulary (__REQUIRED__)
@@ -211,6 +218,12 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and
 * __--gpt-model-name/--gmn__ : name of the gpt pre-trained model (default = 'openai-gpt')
 
 
+### GPT-2
+
+* __--gpt2-model-dir/--g2d__ : directory that contains the gpt2 pre-trained model and the vocabulary (__REQUIRED__)
+* __--gpt2-model-name/--g2n__ : name of the gpt2 pre-trained model (default = 'gpt2')
+
+
 ## Evaluate Language Model(s) Generation
 
 options:
diff --git a/download_models.sh b/download_models.sh
index 886092b..6694b62 100755
--- a/download_models.sh
+++ b/download_models.sh
@@ -29,6 +29,18 @@ if [[ ! -f gpt/openai-gpt/config.json ]]; then
   cd ../..
 fi
 
+echo "GPT2"
+if [[ ! -f gpt/gpt2/config.json ]]; then
+  rm -rf 'gpt/gpt2'
+  mkdir -p 'gpt/gpt2'
+  cd 'gpt/gpt2'
+  wget 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json' -O vocab.json
+  wget 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt' -O merges.txt
+  wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin' -O 'pytorch_model.bin'
+  wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json' -O 'config.json'
+  cd ../..
+fi
+
 echo "BERT BASE LOWERCASED"
 if [[ ! -f bert/uncased_L-12_H-768_A-12/bert_config.json ]]; then
   mkdir -p 'bert'
@@ -131,6 +143,29 @@ if [[ ! -f bert/cased_L-24_H-1024_A-16/bert_config.json ]]; then
   cd ../../
 fi
 
+echo "RoBERTa"
+if [[ ! -f roberta/roberta.base/dict.txt ]]; then
+  rm -rf 'roberta/roberta.base'
+  mkdir -p 'roberta/roberta.base'
+  cd 'roberta'
+  wget -c 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz'
+  tar -xzf roberta.base.tar.gz
+  rm roberta.base.tar.gz
+  cd ..
+fi
+
+echo "HuggingFace RoBERTa"
+if [[ ! -f roberta/roberta-base/config.json ]]; then
+  rm -rf 'roberta/roberta-base'
+  mkdir -p 'roberta/roberta-base'
+  cd 'roberta/roberta-base'
+  wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json' -O vocab.json
+  wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt' -O merges.txt
+  wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin' -O 'pytorch_model.bin'
+  wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json' -O 'config.json'
+  cd ../..
+fi
+
 
 cd "$ROOD_DIR"
 echo 'Building common vocab'
@@ -138,6 +173,6 @@ if [ ! -f "$DST_DIR/common_vocab_cased.txt" ]; then
   python lama/vocab_intersection.py
 else
   echo 'Already exists. Run to re-build:'
-  echo 'python util_KB_completion.py'
+  echo 'python lama/vocab_intersection.py'
 fi
 
diff --git a/lama/modules/__init__.py b/lama/modules/__init__.py
index 73c9fcf..d1f4620 100644
--- a/lama/modules/__init__.py
+++ b/lama/modules/__init__.py
@@ -9,6 +9,8 @@
 from .gpt_connector import GPT
 from .transformerxl_connector import TransformerXL
 from .roberta_connector import Roberta
+from .hfroberta_connector import HfRoberta
+from .gpt2_connector import GPT2
 
 
 def build_model_by_name(lm, args, verbose=True):
@@ -22,7 +24,9 @@ def build_model_by_name(lm, args, verbose=True):
         bert=Bert,
         gpt=GPT,
         transformerxl=TransformerXL,
-        roberta=Roberta
+        roberta=Roberta,
+        hfroberta=HfRoberta,
+        gpt2=GPT2
     )
     if lm not in MODEL_NAME_TO_CLASS:
         raise ValueError("Unrecognized Language Model: %s." % lm)
diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py
index e32bf40..3127160 100644
--- a/lama/modules/base_connector.py
+++ b/lama/modules/base_connector.py
@@ -12,15 +12,24 @@
 BERT_CLS = "[CLS]"
 BERT_SEP = "[SEP]"
 BERT_PAD = "[PAD]"
+
 ELMO_UNK = "<UNK>"
 ELMO_START_SENTENCE = "<S>"
 ELMO_END_SENTENCE = "</S>"
+
 OPENAI_UNK = "<unk>"
 OPENAI_EOS = "<eos>"
-ROBERTA_MASK = "<mask>"
-ROBERTA_START_SENTENCE = "<s>"
-ROBERTA_END_SENTENCE = "</s>"
-ROBERTA_VOCAB_SIZE = 50266
+
+ROBERTA_MASK = "<mask>"         # MASK for fairseq/huggingface RoBERTa
+ROBERTA_VOCAB_SIZE = 50266      # for fairseq RoBERTa
+
+ROBERTA_START_SENTENCE = "<s>"  # BOS, CLS for huggingface RoBERTa
+ROBERTA_END_SENTENCE = "</s>"   # EOS, SEP for huggingface RoBERTa
+ROBERTA_UNK = "<unk>"           # UNK for huggingface RoBERTa
+ROBERTA_PAD = "<pad>"           # PAD for huggingface RoBERTa
+
+GPT2_EOS = "<|endoftext|>"      # BOS, EOS, UNK, PAD for GPT2
+
 
 SPECIAL_SYMBOLS = [
     MASK,
@@ -32,7 +41,13 @@
     ELMO_START_SENTENCE,
     ELMO_END_SENTENCE,
     OPENAI_UNK,
-    OPENAI_EOS
+    OPENAI_EOS,
+    ROBERTA_MASK,
+    # ROBERTA_UNK,
+    ROBERTA_PAD,
+    ROBERTA_START_SENTENCE,
+    ROBERTA_END_SENTENCE,
+    GPT2_EOS
     ]
 
 SPACE_NORMALIZER = re.compile(r"\s+")
diff --git a/lama/modules/gpt2_connector.py b/lama/modules/gpt2_connector.py
new file mode 100644
index 0000000..8e5b7b2
--- /dev/null
+++ b/lama/modules/gpt2_connector.py
@@ -0,0 +1,167 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
+import numpy as np
+from lama.modules.base_connector import *
+
+
+class GPT2(Base_Connector):
+
+    def __init__(self, args):
+        super().__init__()
+
+        if args.gpt2_model_dir is not None:
+            # load GPT2 model from file
+            gpt_model_name = str(args.gpt2_model_dir) + "/"
+            dict_file = gpt_model_name
+            print("loading GPT2 model from {}".format(gpt_model_name))
+        else:
+            # load GPT2 model from huggingface cache
+            gpt_model_name = args.gpt2_model_name
+            dict_file = gpt_model_name
+
+        # Load pre-trained model tokenizer (vocabulary)
+        self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file)
+
+        # GPT-2 uses a byte-level BPE that differs from BERT's WordPiece. A
+        # token that begins a new word carries a leading 'Ġ' (the printable
+        # form of a space), while other pieces are written as is. In BERT the
+        # prefixes are written as is while the parts that must follow (not be
+        # followed!) have a '##' prefix. There is no one-to-one conversion. But
+        # at least we may make pieces that may form a full word look the same.
+        # Note that we should be very careful now,
+        # tokenizer.convert_tokens_to_ids won't work with our vocabulary.
+            
+        def convert_word(word):
+            if word == GPT2_EOS:
+                return word
+
+            if word.startswith('Ġ'):  # the token starts with a whitespace
+                return word[1:]
+
+            return f'_{word}_'  # the token not start with a white space.
+                                # may be not a head of a word,
+                                # or may be a head of a sentence.
+
+        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
+        self.vocab = [convert_word(word) for word in gpt_vocab]
+        self._init_inverse_vocab()
+
+        # Load pre-trained model (weights)
+        self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name)
+        self.gpt_model.eval()
+        # print(self.gpt_model.config)
+
+        # Sanity check.
+        assert len(self.vocab) == self.gpt_model.config.vocab_size
+        #assert 0 == self.gpt_model.config.n_special
+
+        self.eos_id = self.gpt_model.config.eos_token_id
+        self.pad_id = self.gpt_model.config.eos_token_id
+        self.unk_id = self.gpt_model.config.eos_token_id
+        self.bos_id = self.gpt_model.config.bos_token_id
+        self.model_vocab = self.vocab
+
+    def _cuda(self):
+        self.gpt_model.cuda()
+
+    def get_id(self, string):
+        indexed_string = self.tokenizer.encode(f'a {string}')[1:]
+        return indexed_string
+
+    def __get_input_tensors(self, sentence_list):
+        """Concatenates, tokenizes and converts sentences to model inputs.
+
+        Args:
+            sentence_list: A list of strings. Each string may contain the
+            special [MASK] token.
+
+        Returns:
+            A tuple (src_tensor, dst_tensor, masked_indices, tokenized_text).
+                src_tensor: torch.LongTensor with shape (seq_len), the token ids
+                    with self.bos_id prepended and the last symbol dropped.
+                dst_tensor: torch.LongTensor with shape (seq_len), the token ids.
+                masked_indices: A list of indices of [MASK] in dst_tensor.
+                tokenized_text: self.tokenizer.decode() of the token ids.
+            """
+        # Split the sentence by [MASK] and tokenize the chunks independently.
+        tokenized_text = []
+        masked_indices = []
+        for sentence_idx, sentence in enumerate(sentence_list):
+            if sentence_idx > 0:
+                tokenized_text.append(self.eos_id)
+            for chunk_idx, chunk in enumerate(sentence.split('[MASK]')):
+                if chunk_idx > 0:
+                    masked_indices.append(len(tokenized_text))
+                    tokenized_text.append(self.unk_id)  # use UNK as [MASK]
+                chunk = chunk.strip()
+                if chunk:
+                    tokenized_sentence = self.tokenizer.encode(chunk)
+                    tokenized_text.extend(tokenized_sentence)
+
+        full_indexed_tokens = [
+            self.bos_id
+        ] + tokenized_text
+        full_tokens_tensor = torch.tensor(full_indexed_tokens)
+        src_tensor = full_tokens_tensor[:-1]
+        dst_tensor = full_tokens_tensor[1:]
+
+        tokenized_text = self.tokenizer.decode(tokenized_text)  # rebinds the id list to the decoder's output
+
+        return src_tensor, dst_tensor, masked_indices, tokenized_text
+
+    def get_batch_generation(self, sentences_list, logger=None, try_cuda=True):
+        if try_cuda:
+            self.try_cuda()
+        src_tensor_list, dst_tensor_list, masked_indices_list, _ = zip(*[
+            self.__get_input_tensors(sentences) for sentences in sentences_list
+        ])
+
+        src_tensor_batch = torch.nn.utils.rnn.pad_sequence(
+            src_tensor_list, batch_first=True)
+
+        # The model uses shared embedding space for tokens and positions. More
+        # precisely, the first len(vocab) indices are reserved for words, the
+        # last n_special symbols are reserved for special symbols and the rest
+        # is used for positions. Softmax and embedding matrices are shared and
+        # as a result some output "symbols" correspond to positions, so we drop
+        # those logits. NOTE(review): inherited from the GPT connector - confirm for GPT-2.
+        with torch.no_grad():
+            logits = self.gpt_model(src_tensor_batch.to(self._model_device))[0]
+            logits = logits[..., :self.gpt_model.config.vocab_size]
+
+            log_probs = torch.nn.functional.log_softmax(logits, dim=-1).cpu()
+
+        token_ids_list = [
+            np.array(dst_tensor.numpy()) for dst_tensor in dst_tensor_list
+        ]
+
+        return log_probs, token_ids_list, masked_indices_list
+
+    def get_contextual_embeddings(self, sentences_list, try_cuda=True):
+
+        if try_cuda:
+            self.try_cuda()
+
+        src_tensor_list, dst_tensor_list, masked_indices_list, _ = zip(*[
+            self.__get_input_tensors(sentences) for sentences in sentences_list
+        ])
+
+        src_tensor_batch = torch.nn.utils.rnn.pad_sequence(
+            src_tensor_list, batch_first=True)
+
+        with torch.no_grad():
+            output = self.gpt_model.transformer(src_tensor_batch.to(self._model_device))
+
+        # TODO
+        sentence_lengths = None
+        tokenized_text_list = None
+
+        # As we only return the last layer, [] to have the same format as other models
+        return [output], sentence_lengths, tokenized_text_list
+
+
diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py
new file mode 100644
index 0000000..4cf7c94
--- /dev/null
+++ b/lama/modules/hfroberta_connector.py
@@ -0,0 +1,257 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+#
+from pytorch_transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM
+
+import torch
+import numpy as np
+from lama.modules.base_connector import *
+
+import torch.nn.functional as F
+
+class HfRoberta(Base_Connector):
+
+    def __init__(self, args):
+        super().__init__()
+
+        if args.hfroberta_model_dir is not None:
+            # load RoBERTa model from file
+            roberta_model_name = str(args.hfroberta_model_dir) + "/"
+            dict_file = roberta_model_name
+            print("loading huggingface RoBERTa model from {}".format(roberta_model_name))
+        else:
+            # load RoBERTa model from huggingface cache
+            roberta_model_name = args.hfroberta_model_name
+            dict_file = roberta_model_name
+
+        # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer
+        do_lower_case = False
+        if 'uncased' in roberta_model_name:
+            do_lower_case=True
+
+        # Load pre-trained model tokenizer (vocabulary)
+        self.tokenizer = RobertaTokenizer.from_pretrained(dict_file)
+
+        # original vocab
+
+        # The following process is based on gpt_connector.
+
+        # RoBERTa also uses BPE. The bytes_to_unicode function takes all control
+        # and whitespace characters in code points 0-255 and shifts them up
+        # by 256 to make them printable. So space (code point 32) becomes Ġ (code point 288).
+        # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159).
+        #
+        # Other control characters will be removed during the vocab_intersection process.
+        def convert_word(word):
+            if word == ROBERTA_UNK:
+                return word
+            if word == ROBERTA_MASK:
+                return word
+            if word == ROBERTA_START_SENTENCE:
+                return word
+            if word == ROBERTA_END_SENTENCE:
+                return word
+            if word == ROBERTA_PAD:
+                return word
+
+            if word.startswith('Ġ'):  # the token starts with a whitespace
+                return word[1:]
+            
+            return f'_{word}_'  # the token not start with a white space.
+                                # may be not a head of a word,
+                                # or may be a head of a sentence.
+
+            # need duplication check?
+
+        _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
+        self.vocab = [convert_word(word) for word in gpt_vocab]
+        self._init_inverse_vocab()
+
+        # Get UNK symbol as it's written in the origin RoBERTa vocab.
+        unk_index = self.inverse_vocab[ROBERTA_UNK]  # OPENAI_UNK
+        self.unk_symbol = self.tokenizer.decoder[unk_index]
+
+        # Get MASK symbol as it's written in the origin RoBERTa vocab.
+        mask_index = self.inverse_vocab[ROBERTA_MASK]
+        self.mask_symbol = self.tokenizer.decoder[mask_index]
+
+        # Load pre-trained model (weights)
+        self.masked_roberta_model = RobertaForMaskedLM.from_pretrained(roberta_model_name)
+        self.masked_roberta_model.eval()
+        #print(self.masked_roberta_model.config)
+
+        # ... to get hidden states
+        self.roberta_model = self.masked_roberta_model.roberta
+
+        # Sanity check.
+        #assert len(self.vocab) == self.masked_roberta_model.config.vocab_size
+        #assert 0 == self.masked_roberta_model.config.n_special
+
+        self.eos_id = self.inverse_vocab[ROBERTA_END_SENTENCE]  # OPENAI_EOS
+        self.model_vocab = self.vocab
+
+        self.pad_id = self.inverse_vocab[ROBERTA_PAD]
+        self.unk_index = self.inverse_vocab[ROBERTA_UNK]
+        self.mask_index = mask_index
+
+    def __get_token_ids_from_tensor(self, indexed_string):
+        token_ids = indexed_string
+        return token_ids
+
+    def _cuda(self):
+        self.masked_roberta_model.cuda()
+
+    def get_id(self, string):
+        # tokenize "a " + string, in order to create token_id(s) corresponding to the string.
+        # the first token of the string starts with a whitespace.
+        tokenized_text = self.tokenizer.tokenize(f'a {string}')
+        tokenized_text = tokenized_text[1:]
+        indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text)
+        return indexed_string
+
+    def __get_input_tensors_batch(self, sentences_list):
+        tokens_tensors_list = []
+        segments_tensors_list = []
+        masked_indices_list = []
+        tokenized_text_list = []
+        max_tokens = 0
+        for sentences in sentences_list:
+            tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences)
+            tokens_tensors_list.append(tokens_tensor)
+            segments_tensors_list.append(segments_tensor)
+            masked_indices_list.append(masked_indices)
+            tokenized_text_list.append(tokenized_text)
+            # assert(tokens_tensor.shape[1] == segments_tensor.shape[1])
+            if (tokens_tensor.shape[1] > max_tokens):
+                max_tokens = tokens_tensor.shape[1]
+        # print("MAX_TOKENS: {}".format(max_tokens))
+        # apply padding and concatenate tensors
+        # use [PAD] for tokens and 0 for segments
+        final_tokens_tensor = None
+        final_segments_tensor = None
+        final_attention_mask = None
+        for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list):
+            dim_tensor = tokens_tensor.shape[1]
+            pad_lenght = max_tokens - dim_tensor
+            attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long)
+            if pad_lenght>0:
+                pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long)
+                pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long)
+                attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long)
+                tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1)
+                segments_tensor = torch.cat((segments_tensor,pad_2), dim=1)
+                attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1)
+            if final_tokens_tensor is None:
+                final_tokens_tensor = tokens_tensor
+                final_segments_tensor = segments_tensor
+                final_attention_mask = attention_tensor
+            else:
+                final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0)
+                final_segments_tensor = torch.cat((final_segments_tensor,segments_tensor), dim=0)
+                final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0)
+        # print(final_tokens_tensor)
+        # print(final_segments_tensor)
+        # print(final_attention_mask)
+        # print(final_tokens_tensor.shape)
+        # print(final_segments_tensor.shape)
+        # print(final_attention_mask.shape)
+        return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list
+
+    def __get_input_tensors(self, sentences):
+        tokenized_text = []
+        masked_indices = []
+        segment_indices = []
+
+        # add [CLS] or [BOS] token at the beginning
+        tokenized_text.append(ROBERTA_START_SENTENCE)
+        segment_indices.append(0)
+
+        for sentence_idx, sentence in enumerate(sentences):
+            for chunk_idx, chunk in enumerate(sentence.split('[MASK]')):
+                if chunk_idx > 0:
+                    masked_indices.append(len(tokenized_text))
+                    segment_indices.append(sentence_idx)
+                    tokenized_text.append(self.mask_symbol)
+
+                chunk = chunk.strip()
+                if chunk:
+                    tokenized_sentence = self.tokenizer.tokenize(chunk)
+                    segment_id = np.full(len(tokenized_sentence),
+                                         sentence_idx,
+                                         dtype=int).tolist()
+
+                    tokenized_text.extend(tokenized_sentence)
+                    segment_indices.extend(segment_id)
+
+            # add [EOS] or [SEP] token at the end of sequence or sentence
+            tokenized_text.append(ROBERTA_END_SENTENCE)
+            segment_indices.append(sentence_idx)
+
+        # look for masked indices
+        masked_indices = []
+        for i in range(len(tokenized_text)):
+            token = tokenized_text[i]
+            if token == ROBERTA_MASK:  # MASK
+                masked_indices.append(i)
+
+        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
+
+        # Convert inputs to PyTorch tensors
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segment_indices])
+
+        return tokens_tensor, segments_tensors, masked_indices, tokenized_text
+
+
+    def get_batch_generation(self, sentences_list, logger=None, try_cuda=True):
+        if not sentences_list:
+            return None
+        if try_cuda:
+            self.try_cuda()
+        #print(sentences_list)
+        tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list)
+
+        if logger is not None:
+            logger.debug("\n{}\n".format(tokenized_text_list))
+
+        with torch.no_grad():
+            logits = self.masked_roberta_model(
+                input_ids=tokens_tensor.to(self._model_device),
+                token_type_ids=segments_tensor.to(self._model_device),
+                attention_mask=attention_mask_tensor.to(self._model_device),
+            )[0]
+
+            log_probs = F.log_softmax(logits, dim=-1).cpu()
+
+        token_ids_list = []
+        for indexed_string in tokens_tensor.numpy():
+            token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string))
+
+        return log_probs, token_ids_list, masked_indices_list
+
+    def get_contextual_embeddings(self, sentences_list, try_cuda=True):
+        """Return (encoder outputs, per-sample token counts, tokenized texts), or None for empty input."""
+        # assume in input 1 or 2 sentences - in general, it considers only the first 2 sentences
+        if not sentences_list:
+            return None
+        if try_cuda:
+            self.try_cuda()
+
+        tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list)
+
+        with torch.no_grad():
+            # keyword arguments, as in get_batch_generation; the mask keeps padding out of attention
+            all_encoder_layers, _ = self.roberta_model(
+                input_ids=tokens_tensor.to(self._model_device),
+                token_type_ids=segments_tensor.to(self._model_device),
+                attention_mask=attention_mask_tensor.to(self._model_device))
+        all_encoder_layers = [layer.cpu() for layer in all_encoder_layers]
+
+        sentence_lengths = [len(x) for x in tokenized_text_list]
+        # NOTE(review): the description below is inherited from the BERT connector - confirm that
+        # the first output of self.roberta_model really is one encoded-hidden-state per attention
+        # block, each a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
+        return all_encoder_layers, sentence_lengths, tokenized_text_list
diff --git a/lama/options.py b/lama/options.py
index a767ef7..e1c754f 100644
--- a/lama/options.py
+++ b/lama/options.py
@@ -48,6 +48,8 @@ def get_general_parser():
     __add_gpt_args(parser)
     __add_transformerxl_args(parser)
     __add_roberta_args(parser)
+    __add_hfroberta_args(parser)
+    __add_gpt2_args(parser)
     return parser
 
 
@@ -156,6 +158,24 @@ def __add_roberta_args(parser):
     return group
 
 
+def __add_hfroberta_args(parser):  # registers the "HuggingFace RoBERTa" option group on parser and returns it
+    group = parser.add_argument_group("HuggingFace RoBERTa")
+    group.add_argument(
+        "--hfroberta-model-dir",
+        "--hmd",
+        dest="hfroberta_model_dir",
+        help="directory that contains the HuggingFace ROBERTA pre-trained model and the vocabulary",
+    )
+    group.add_argument(
+        "--hfroberta-model-name",
+        "--hmn",
+        dest="hfroberta_model_name",
+        default="roberta-base",
+        help="name of the HuggingFace ROBERTA pre-trained model (default = 'roberta-base')",
+    )
+    return group
+
+
 def __add_gpt_args(parser):
     group = parser.add_argument_group("GPT")
     group.add_argument(
@@ -174,6 +194,24 @@ def __add_gpt_args(parser):
     return group
 
 
+def __add_gpt2_args(parser):
+    group = parser.add_argument_group("GPT2")
+    group.add_argument(
+        "--gpt2-model-dir",
+        "--g2d",
+        dest="gpt2_model_dir",
+        help="directory that contains the gpt2 pre-trained model and the vocabulary",
+    )
+    group.add_argument(
+        "--gpt2-model-name",
+        "--g2n",
+        dest="gpt2_model_name",
+        default="gpt2",
+        help="name of the gpt2 pre-trained model (default = 'gpt2')",
+    )
+    return group
+
+
 def __add_transformerxl_args(parser):
     group = parser.add_argument_group("GPT")
     group.add_argument(
diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py
index 235024c..a838814 100644
--- a/lama/vocab_intersection.py
+++ b/lama/vocab_intersection.py
@@ -21,11 +21,11 @@
   #   "cpu": True,
   #   "output_dictionary_size": -1
   # },
-  {
-    # "TransformerXL"
-    "lm": "transformerxl",
-    "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/",
-  },
+  # {
+  #   # "TransformerXL"
+  #   "lm": "transformerxl",
+  #   "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/",
+  # },
   {
     # "ELMO ORIGINAL"
     "lm": "elmo",
@@ -55,32 +55,55 @@
     "bert_model_name": "bert-large-cased",
     "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16/",
     "bert_vocab_name": "vocab.txt"
-  }
+  },
+  {
+    # "RoBERTa base"
+    "lm" : "roberta",
+    "roberta_model_name": "model.pt",
+    "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base",
+    "roberta_vocab_name": "dict.txt",
+    "max_sentence_length": 100
+  },
+  {
+    # "hfRoBERTa base"
+    "lm" : "hfroberta",
+    "hfroberta_model_name": "roberta-base",
+    "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base",
+  },
+  {
+    # "OpenAI GPT-2"
+    "lm": "gpt2",
+    "gpt2_model_name": "gpt2",
+    "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2",
+  },
 ]
 
 CASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_cased.txt"
 
 LOWERCASED_MODELS = [
- {
-   # "BERT BASE UNCASED"
-   "lm": "bert",
-   "bert_model_name": "bert-base-uncased",
-   "bert_model_dir": None,
-   "bert_vocab_name": "vocab.txt"
- },
- {
-   # "BERT LARGE UNCASED"
-   "lm": "bert",
-   "bert_model_name": "bert-large-uncased",
-   "bert_model_dir": None,
-   "bert_vocab_name": "vocab.txt"
- },
- {
-   # "OpenAI GPT"
-   "lm": "gpt",
-   "gpt_model_dir": None,
-   "gpt_model_name": "openai-gpt"
- }
+  {
+    # "BERT BASE UNCASED"
+    "lm": "bert",
+    "bert_model_name": "bert-base-uncased",
+    #"bert_model_dir": None,
+    "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12",
+    "bert_vocab_name": "vocab.txt"
+  },
+  {
+    # "BERT LARGE UNCASED"
+    "lm": "bert",
+    "bert_model_name": "bert-large-uncased",
+    #"bert_model_dir": None,
+    "bert_model_dir": "pre-trained_language_models/bert/uncased_L-24_H-1024_A-16",
+    "bert_vocab_name": "vocab.txt"
+  },
+  {
+    # "OpenAI GPT"
+    "lm": "gpt",
+    #"gpt_model_dir": None,
+    "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt",
+    "gpt_model_name": "openai-gpt"
+  },
 ]
 
 LOWERCASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_lowercased.txt"
diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py
index 1b8fec4..708c11d 100644
--- a/scripts/batch_eval_KB_completion.py
+++ b/scripts/batch_eval_KB_completion.py
@@ -201,12 +201,22 @@ def lowercase_samples(samples, use_negated_probes=False):
     new_samples = []
     for sample in samples:
         sample["obj_label"] = sample["obj_label"].lower()
-        sample["sub_label"] = sample["sub_label"].lower()
+        try:
+            sample["sub_label"] = sample["sub_label"].lower()
+        except KeyError:  # ConceptNet
+            None
         lower_masked_sentences = []
-        for sentence in sample["masked_sentences"]:
-            sentence = sentence.lower()
-            sentence = sentence.replace(base.MASK.lower(), base.MASK)
-            lower_masked_sentences.append(sentence)
+        try:
+            for sentence in sample["masked_sentences"]:
+                sentence = sentence.lower()
+                sentence = sentence.replace(base.MASK.lower(), base.MASK)
+                lower_masked_sentences.append(sentence)
+        except KeyError:
+            for evidence in sample['evidences']:  # TREx
+                sentence = evidence['masked_sentence']
+                sentence = sentence.lower()
+                sentence = sentence.replace(base.MASK.lower(), base.MASK)
+                lower_masked_sentences.append(sentence)
         sample["masked_sentences"] = lower_masked_sentences
 
         if "negated" in sample and use_negated_probes:
@@ -318,6 +328,10 @@ def main(args, shuffle_data=True, model=None):
         model_name = "BERT_{}".format(args.bert_model_name)
     elif model_type_name == "elmo":
         model_name = "ELMo_{}".format(args.elmo_model_name)
+    elif model_type_name == "roberta":
+        model_name = "RoBERTa_{}".format(args.roberta_model_name)
+    elif model_type_name == "hfroberta":
+        model_name = "hfRoBERTa_{}".format(args.hfroberta_model_name)
     else:
         model_name = model_type_name.title()
 
@@ -385,6 +399,14 @@ def main(args, shuffle_data=True, model=None):
     else:
         # keep samples as they are
         all_samples = data
+        # TREx data
+        for i, sample in enumerate(all_samples):
+            if 'masked_sentences' not in sample.keys():
+                sample['masked_sentences'] = []
+                for evidence in sample['evidences']:
+                    sample['masked_sentences'].append(evidence['masked_sentence'])
+                if i == 0:
+                    print('not masked_sentences, but masked_sentence.')
 
     all_samples, ret_msg = filter_samples(
         model, data, vocab_subset, args.max_sentence_length, args.template
@@ -646,12 +668,15 @@ def main(args, shuffle_data=True, model=None):
     pool.join()
 
     # stats
-    # Mean reciprocal rank
-    MRR /= len(list_of_results)
-
-    # Precision
-    Precision /= len(list_of_results)
-    Precision1 /= len(list_of_results)
+    try:
+       # Mean reciprocal rank
+       MRR /= len(list_of_results)
+
+       # Precision
+       Precision /= len(list_of_results)
+       Precision1 /= len(list_of_results)
+    except ZeroDivisionError:
+       MRR = Precision = Precision1 = 0.0
 
     msg = "all_samples: {}\n".format(len(all_samples))
     msg += "list_of_results: {}\n".format(len(list_of_results))
diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py
index 024e681..b06c82f 100644
--- a/scripts/run_experiments.py
+++ b/scripts/run_experiments.py
@@ -17,13 +17,13 @@
 from collections import defaultdict
 
 LMs = [
-    {
-        "lm": "transformerxl",
-        "label": "transformerxl",
-        "models_names": ["transformerxl"],
-        "transformerxl_model_name": "transfo-xl-wt103",
-        "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/",
-    },
+    #{
+    #    "lm": "transformerxl",
+    #    "label": "transformerxl",
+    #    "models_names": ["transformerxl"],
+    #    "transformerxl_model_name": "transfo-xl-wt103",
+    #    "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/",
+    #},
     {
         "lm": "elmo",
         "label": "elmo",
@@ -56,9 +56,39 @@
         "bert_model_name": "bert-large-cased",
         "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16",
     },
+    {
+        "lm": "gpt",
+        "label": "gpt",
+        "models_names": ["gpt"],
+        "gpt_model_name": "openai-gpt",
+        "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/",
+        "lowercase": True,
+        "common_vocab_filename": "pre-trained_language_models/common_vocab_lowercased.txt",
+    },
+    {
+        "lm": "roberta",
+        "label": "roberta.base",
+        "models_names": ["roberta"],
+        "roberta_model_name": "model.pt",
+        "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base",
+        "roberta_vocab_name": "dict.txt",
+    },
+    {
+        "lm": "hfroberta",
+        "label": "roberta-base",
+        "models_names": ["hfroberta"],
+        "hfroberta_model_name": "roberta-base",
+        "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base",
+    },
+    {
+        "lm": "gpt2",
+        "label": "gpt2",
+        "models_names": ["gpt2"],
+        "gpt2_model_name": "gpt2",
+        "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2",
+    },
 ]
 
-
 def run_experiments(
     relations,
     data_path_pre,
@@ -79,7 +109,11 @@ def run_experiments(
     type_Precision1 = defaultdict(list)
     type_count = defaultdict(list)
 
-    results_file = open("last_results.csv", "w+")
+    results_file = open("last_results.csv", "a+")
+    results_file.write(
+        "{},{}\n".format("lm_label", input_param["label"])
+    )
+    results_file.flush()
 
     for relation in relations:
         pp.pprint(relation)