From 348f6f51510c7b1dc864ceda797435e288f23f91 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:19:01 +0000 Subject: [PATCH 1/8] changing config and classes construction for allennlp2 --- scripts/train.sh | 5 +- .../config.jsonnet | 84 ++++++++----------- .../dataset_reader.py | 27 +++--- sequential_sentence_classification/model.py | 10 +-- 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 6ed1c61..74c3a62 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -5,8 +5,7 @@ export PYTORCH_SEED=`expr $SEED / 10` export NUMPY_SEED=`expr $PYTORCH_SEED / 10` # path to bert vocab and weights -export BERT_VOCAB=https://ai2-s2-research.s3-us-west-2.amazonaws.com/scibert/allennlp_files/scivocab_uncased.vocab -export BERT_WEIGHTS=https://ai2-s2-research.s3-us-west-2.amazonaws.com/scibert/allennlp_files/scibert_scivocab_uncased.tar.gz +export BERT_MODEL=allenai/scibert_scivocab_uncased # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl @@ -35,4 +34,4 @@ export SCI_SUM_FAKE_SCORES=false # use fake scores for testing CONFIG_FILE=sequential_sentence_classification/config.jsonnet -python -m allennlp.run train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" +python -m allennlp train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index d42b0c8..1635e2c 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -12,45 +12,42 @@ local boolToInt(s) = "random_seed": std.parseInt(std.extVar("SEED")), "pytorch_seed": std.parseInt(std.extVar("PYTORCH_SEED")), "numpy_seed": std.parseInt(std.extVar("NUMPY_SEED")), - "dataset_reader":{ - "type":"SeqClassificationReader", - "lazy": false, - "sent_max_len": std.extVar("SENT_MAX_LEN"), - "word_splitter": "bert-basic", - "max_sent_per_example": std.extVar("MAX_SENT_PER_EXAMPLE"), - "token_indexers": { - "bert": { - "type": "bert-pretrained", - "pretrained_model": std.extVar("BERT_VOCAB"), - "do_lowercase": true, - "use_starting_offsets": false - }, + "dataset_reader" : { + "type": "SeqClassificationReader", + "tokenizer": { + "type": "pretrained_transformer", + "model_name": std.extVar("BERT_MODEL"), + "tokenizer_kwargs": {"truncation_strategy" : 'do_not_truncate'}, + }, + "token_indexers": { + "bert": { + "type": "pretrained_transformer", + "model_name": std.extVar("BERT_MODEL"), + "tokenizer_kwargs": {"truncation_strategy" : 'do_not_truncate'}, + } + }, + "sent_max_len": std.parseInt(std.extVar("SENT_MAX_LEN")), + "max_sent_per_example": 10, + "use_sep": stringToBool(std.extVar("USE_SEP")), + "sci_sum": stringToBool(std.extVar("SCI_SUM")), + "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), + "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), }, - "use_sep": std.extVar("USE_SEP"), - "sci_sum": stringToBool(std.extVar("SCI_SUM")), - "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), - "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), - }, - "train_data_path": std.extVar("TRAIN_PATH"), "validation_data_path": std.extVar("DEV_PATH"), "test_data_path": std.extVar("TEST_PATH"), "evaluate_on_test": true, "model": { "type": "SeqClassificationModel", - "text_field_embedder": { - "allow_unmatched_keys": true, - 
"embedder_to_indexer_map": { - "bert": if stringToBool(std.extVar("USE_SEP")) then ["bert"] else ["bert", "bert-offsets"], - "tokens": ["tokens"], - }, + "text_field_embedder": { "token_embedders": { "bert": { - "type": "bert-pretrained", - "pretrained_model": std.extVar("BERT_WEIGHTS"), - "requires_grad": 'all', - "top_layer_only": false, - } + "type": "pretrained_transformer", + "model_name": std.extVar("BERT_MODEL"), + "train_parameters": 1, + "last_layer_only": 1, + + } } }, "use_sep": std.extVar("USE_SEP"), @@ -59,43 +56,30 @@ local boolToInt(s) = "sci_sum": stringToBool(std.extVar("SCI_SUM")), "additional_feature_size": boolToInt(stringToBool(std.extVar("USE_ABSTRACT_SCORES"))), "self_attn": { - "type": "stacked_self_attention", + "type": "pytorch_transformer", "input_dim": 768, - "projection_dim": 100, "feedforward_hidden_dim": 50, "num_layers": 2, "num_attention_heads": 2, - "hidden_dim": 100, }, }, - "iterator": { - "type": "bucket", - "sorting_keys": [["sentences", "num_fields"]], - "batch_size" : std.parseInt(std.extVar("BATCH_SIZE")), - "cache_instances": true, - "biggest_batch_first": true - }, - + "data_loader": { + "batch_size": std.parseInt(std.extVar("BATCH_SIZE")), + "shuffle": false, + } "trainer": { "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), "grad_clipping": 1.0, "patience": 5, "model_save_interval": 3600, "validation_metric": if stringToBool(std.extVar("SCI_SUM")) then "-loss" else '+acc', - "min_delta": 0.001, "cuda_device": std.parseInt(std.extVar("cuda_device")), "gradient_accumulation_batch_size": 32, "optimizer": { - "type": "bert_adam", - "lr": std.extVar("LR"), - "t_total": -1, - "max_grad_norm": 1.0, + "type": "huggingface_adamw", + "lr": std.parseJson(std.extVar("LR")), "weight_decay": 0.01, - "parameter_groups": [ - [["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"], {"weight_decay": 0.0}], - ], }, - "should_log_learning_rate": true, "learning_rate_scheduler": { "type": "slanted_triangular", "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index 789ecf2..f6772c5 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -7,14 +7,13 @@ from allennlp.data.dataset_readers.dataset_reader import DatasetReader from allennlp.common.file_utils import cached_path -from allennlp.data import Tokenizer +from allennlp.data import TokenIndexer, Tokenizer from allennlp.data.instance import Instance from allennlp.data.fields.field import Field from allennlp.data.fields import TextField, LabelField, ListField, ArrayField, MultiLabelField -from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer -from allennlp.data.tokenizers import WordTokenizer +from allennlp.data.token_indexers import SingleIdTokenIndexer +from allennlp.data.tokenizers import WhitespaceTokenizer from allennlp.data.tokenizers.token import Token -from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter, WordSplitter, SpacyWordSplitter @DatasetReader.register("SeqClassificationReader") @@ -31,9 +30,7 @@ class SeqClassificationReader(DatasetReader): """ def __init__(self, - lazy: bool = False, token_indexers: Dict[str, TokenIndexer] = None, - word_splitter: WordSplitter = None, tokenizer: Tokenizer = None, sent_max_len: int = 100, max_sent_per_example: int = 20, @@ -43,8 +40,9 @@ def __init__(self, sci_sum_fake_scores: bool = True, predict: bool = False, ) -> 
None: - super().__init__(lazy) - self._tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=False)) + super().__init__(manual_distributed_sharding=True, + manual_multiprocess_sharding=True, **kwargs) + self._tokenizer = tokenizer or WhitespaceTokenizer() self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self.sent_max_len = sent_max_len self.use_sep = use_sep @@ -54,12 +52,12 @@ def __init__(self, self.use_abstract_scores = use_abstract_scores self.sci_sum_fake_scores = sci_sum_fake_scores - @overrides + def _read(self, file_path: str): file_path = cached_path(file_path) with open(file_path) as f: - for line in f: + for line in self.shard_iterable(f): json_dict = json.loads(line) instances = self.read_one_example(json_dict) for instance in instances: @@ -173,7 +171,6 @@ def filter_bad_sci_sum_sentences(self, sentences, labels): return sentences, labels - @overrides def text_to_instance(self, sentences: List[str], labels: List[str] = None, @@ -199,7 +196,7 @@ def text_to_instance(self, fields: Dict[str, Field] = {} fields["sentences"] = ListField([ - TextField(sentence, self._token_indexers) + TextField(sentence) for sentence in sentences ]) @@ -223,4 +220,8 @@ def text_to_instance(self, if additional_features is not None: fields["additional_features"] = ArrayField(np.array(additional_features)) - return Instance(fields) \ No newline at end of file + return Instance(fields) + + def apply_token_indexers(self, instance: Instance) -> None: + for text_field in instance["sentences"].field_list: + text_field.token_indexers = self._token_indexers \ No newline at end of file diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index 6e7e9f8..cb9ff7e 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -57,7 +57,7 @@ def __init__(self, vocab: Vocabulary, label_name = self.vocab.get_token_from_index(namespace='labels', index=label_index) self.label_f1_metrics[label_name] = F1Measure(label_index) - encoded_senetence_dim = text_field_embedder._token_embedders['bert'].output_dim + encoded_senetence_dim = text_field_embedder._token_embedders['bert'].get_output_dim() ff_in_dim = encoded_senetence_dim if self.use_sep else self_attn.get_output_dim() ff_in_dim += self.additional_feature_size @@ -94,7 +94,7 @@ def forward(self, # type: ignore # Output: embedded_sentences # embedded_sentences: batch_size, num_sentences, sentence_length, embedding_size - embedded_sentences = self.text_field_embedder(sentences) + embedded_sentences = self.text_field_embedder(sentences, num_wrapping_dims= 1) mask = get_text_field_mask(sentences, num_wrapping_dims=1).float() batch_size, num_sentences, _, _ = embedded_sentences.size() @@ -102,7 +102,7 @@ def forward(self, # type: ignore # The following code collects vectors of the SEP tokens from all the examples in the batch, # and arrange them in one list. It does the same for the labels and confidences. 
# TODO: replace 103 with '[SEP]' - sentences_mask = sentences['bert'] == 103 # mask for all the SEP tokens in the batch + sentences_mask = sentences['bert']["token_ids"] == 103 # mask for all the SEP tokens in the batch embedded_sentences = embedded_sentences[sentences_mask] # given batch_size x num_sentences_per_example x sent_len x vector_len # returns num_sentences_per_batch x vector_len assert embedded_sentences.dim() == 2 @@ -238,8 +238,8 @@ def get_metrics(self, reset: bool = False): average_F1 = 0.0 for name, metric in self.label_f1_metrics.items(): metric_val = metric.get_metric(reset) - metric_dict[name + 'F'] = metric_val[2] - average_F1 += metric_val[2] + metric_dict[name + 'F'] = metric_val["f1"] + average_F1 += metric_val["f1"] average_F1 /= len(self.label_f1_metrics.items()) metric_dict['avgF'] = average_F1 From 1186e3afa44de6bee70ebc52a0d0a37c699edeba Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:28:42 +0000 Subject: [PATCH 2/8] changed sep to a param; dealing with special tokesn in data reader --- scripts/train.sh | 1 + .../config.jsonnet | 1 + .../dataset_reader.py | 31 ++++++++++++++----- sequential_sentence_classification/model.py | 14 +++------ 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 74c3a62..6fd55af 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -14,6 +14,7 @@ export TEST_PATH=data/CSAbstruct/test.jsonl # model export USE_SEP=true # true for our model. false for baseline +export TOKEN=[SEP] export WITH_CRF=false # CRF only works for the baseline # training params diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index 1635e2c..bc4069f 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -29,6 +29,7 @@ local boolToInt(s) = "sent_max_len": std.parseInt(std.extVar("SENT_MAX_LEN")), "max_sent_per_example": 10, "use_sep": stringToBool(std.extVar("USE_SEP")), + "intersentence_token":std.extVar("TOKEN"), "sci_sum": stringToBool(std.extVar("SCI_SUM")), "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index f6772c5..f93dc46 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -51,6 +51,12 @@ def __init__(self, self.max_sent_per_example = max_sent_per_example self.use_abstract_scores = use_abstract_scores self.sci_sum_fake_scores = sci_sum_fake_scores + print("*********************************") + print("start token : ", self._tokenizer.sequence_pair_start_tokens) + print("middle token : ", self._tokenizer.sequence_pair_mid_tokens) + print("end token : ", self._tokenizer.sequence_pair_end_tokens) + print("*********************************") + def _read(self, file_path: str): @@ -185,15 +191,24 @@ def text_to_instance(self, assert len(sentences) == len(additional_features) if self.use_sep: - tokenized_sentences = [self._tokenizer.tokenize(s)[:self.sent_max_len] + [Token("[SEP]")] for s in sentences] - sentences = [list(itertools.chain.from_iterable(tokenized_sentences))[:-1]] - else: # Tokenize the sentences - sentences = [ - self._tokenizer.tokenize(sentence_text)[:self.sent_max_len] - for sentence_text in sentences - ] - + tokenized_sentences =[] + if 
len(self._tokenizer.tokenize(s)) > self.sent_max_len: + tokenized_sentences.append(self._tokenizer.tokenize(s)[:self.sent_max_len]+self._tokenizer.sequence_pair_mid_tokens) + else: + tokenized_sentences.append(self._tokenizer.tokenize(s)) + sentences = [list(itertools.chain.from_iterable(tokenized_sentences))] + + else: + tok_sentences = [] + for sentence_text in sentences: + if len(self._tokenizer.tokenize(sentence_text)) > self.sent_max_len: + tok_sentences.append(self._tokenizer.tokenize(sentence_text)[:self.sent_max_len]+self._tokenizer.sequence_pair_end_tokens) + else: + tok_sentences.append(self._tokenizer.tokenize(sentence_text)) + + sentences = tok_sentences + fields: Dict[str, Field] = {} fields["sentences"] = ListField([ TextField(sentence) diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index cb9ff7e..f6e63ae 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -25,6 +25,7 @@ def __init__(self, vocab: Vocabulary, self_attn: Seq2SeqEncoder = None, bert_dropout: float = 0.1, sci_sum: bool = False, + intersentence_token: str = "[SEP]", additional_feature_size: int = 0, ) -> None: super(SeqClassificationModel, self).__init__(vocab) @@ -36,7 +37,7 @@ def __init__(self, vocab: Vocabulary, self.sci_sum = sci_sum self.self_attn = self_attn self.additional_feature_size = additional_feature_size - + self.token = intersentence_token self.dropout = torch.nn.Dropout(p=bert_dropout) # define loss @@ -62,7 +63,7 @@ def __init__(self, vocab: Vocabulary, ff_in_dim = encoded_senetence_dim if self.use_sep else self_attn.get_output_dim() ff_in_dim += self.additional_feature_size - self.time_distributed_aggregate_feedforward = TimeDistributed(Linear(ff_in_dim, self.num_labels)) + self.time_distributed_aggregate_feedforward = Linear(ff_in_dim, self.num_labels) if self.with_crf: self.crf = ConditionalRandomField( @@ -82,14 +83,8 @@ def forward(self, # type: ignore ---------- TODO: add description - Returns - ------- - An output dictionary consisting of: - loss : torch.FloatTensor, optional - A scalar loss to be optimised. """ # =========================================================================================================== - # Layer 1: For each sentence, participant pair: create a Glove embedding for each token # Input: sentences # Output: embedded_sentences @@ -102,7 +97,8 @@ def forward(self, # type: ignore # The following code collects vectors of the SEP tokens from all the examples in the batch, # and arrange them in one list. It does the same for the labels and confidences. 
# TODO: replace 103 with '[SEP]' - sentences_mask = sentences['bert']["token_ids"] == 103 # mask for all the SEP tokens in the batch + index_sep = int(self.vocab.get_token_index(token=self.token, namespace = "tags")) + sentences_mask = sentences['bert']["token_ids"] == index_sep # mask for all the SEP tokens in the batch embedded_sentences = embedded_sentences[sentences_mask] # given batch_size x num_sentences_per_example x sent_len x vector_len # returns num_sentences_per_batch x vector_len assert embedded_sentences.dim() == 2 From 81c249dd69616e3ed02fb127ad38d588636c3a8e Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:33:18 +0000 Subject: [PATCH 3/8] solved pb of cutting of sentences and labels number not matching --- .../dataset_reader.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index f93dc46..9351657 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -191,13 +191,17 @@ def text_to_instance(self, assert len(sentences) == len(additional_features) if self.use_sep: - # Tokenize the sentences - tokenized_sentences =[] - if len(self._tokenizer.tokenize(s)) > self.sent_max_len: - tokenized_sentences.append(self._tokenizer.tokenize(s)[:self.sent_max_len]+self._tokenizer.sequence_pair_mid_tokens) - else: - tokenized_sentences.append(self._tokenizer.tokenize(s)) - sentences = [list(itertools.chain.from_iterable(tokenized_sentences))] + origin_sent = copy.deepcopy(sentences) + sentences = self.shorten_sentences(sentences, self.sent_max_len) + + max_len=self.sent_max_len + while len(sentences[0]) > 512: + n = int((len(sentences[0])-512)/ len(origin_sent))+1 + + max_len -= n + sentences = self.shorten_sentences(origin_sent, max_len ) + + assert len(sentences[0]) <= 512 else: tok_sentences = [] @@ -239,4 +243,15 @@ def text_to_instance(self, def apply_token_indexers(self, instance: Instance) -> None: for text_field in instance["sentences"].field_list: - text_field.token_indexers = self._token_indexers \ No newline at end of file + text_field.token_indexers = self._token_indexers + + def shorten_sentences(self, origin_sent, max_len): + + tokenized_sentences = [self._tokenizer.sequence_pair_start_tokens] + for s in origin_sent: + if len(self._tokenizer.tokenize(s)) > (max_len): + tokenized_sentences.append(self._tokenizer.tokenize(s)[1:(max_len)]+self._tokenizer.sequence_pair_mid_tokens) + else: + tokenized_sentences.append(self._tokenizer.tokenize(s)[1:-1]+self._tokenizer.sequence_pair_mid_tokens) + mid_tok_len = len(self._tokenizer.sequence_pair_mid_tokens) + return [list(itertools.chain.from_iterable(tokenized_sentences))[:-mid_tok_len]+self._tokenizer.sequence_pair_end_tokens] \ No newline at end of file From 588ac809a22e84e8a1e4d2193900a4117ee7be46 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:36:48 +0000 Subject: [PATCH 4/8] dealing with single sentence in sequence of sentences --- sequential_sentence_classification/model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index f6e63ae..1d33a2e 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -193,7 +193,10 @@ def forward(self, # type: ignore flattened_gold = labels.contiguous().view(-1) 
if not self.with_crf: - label_loss = self.loss(flattened_logits.squeeze(), flattened_gold) + if flattened_logits.shape[0] == 1: + label_loss = self.loss(flattened_logits, flattened_gold) + else: + label_loss = self.loss(flattened_logits.squeeze(), flattened_gold) if confidences is not None: label_loss = label_loss * confidences.type_as(label_loss).view(-1) label_loss = label_loss.mean() @@ -211,7 +214,10 @@ def forward(self, # type: ignore if not self.labels_are_scores: evaluation_mask = (flattened_gold != -1) - self.label_accuracy(flattened_probs.float().contiguous(), flattened_gold.squeeze(-1), mask=evaluation_mask) + if flattened_probs.shape[0] == 1: + self.label_accuracy(flattened_probs.float().contiguous(), flattened_gold, mask=evaluation_mask) + else: + self.label_accuracy(flattened_probs.float().contiguous(), flattened_gold.squeeze(-1), mask=evaluation_mask) # compute F1 per label for label_index in range(self.num_labels): From 850bff6f03d0d4f6c3ab9a3697aa62ea557bd078 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:49:19 +0000 Subject: [PATCH 5/8] added roberta params and model --- scripts/train.sh | 14 ++++++++++---- sequential_sentence_classification/config.jsonnet | 1 + sequential_sentence_classification/model.py | 9 +++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 6fd55af..e41e7bd 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -4,8 +4,15 @@ export SEED=15270 export PYTORCH_SEED=`expr $SEED / 10` export NUMPY_SEED=`expr $PYTORCH_SEED / 10` -# path to bert vocab and weights -export BERT_MODEL=allenai/scibert_scivocab_uncased +# path to bert type and path +# export BERT_MODEL=allenai/scibert_scivocab_uncased +# export TOKEN= [SEP] +# export MODEL_TYPE=bert + +export BERT_MODEL=roberta-base +export TOKEN= +export MODEL_TYPE=roberta + # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl @@ -14,12 +21,11 @@ export TEST_PATH=data/CSAbstruct/test.jsonl # model export USE_SEP=true # true for our model. 
false for baseline -export TOKEN=[SEP] export WITH_CRF=false # CRF only works for the baseline # training params export cuda_device=0 -export BATCH_SIZE=4 +export BATCH_SIZE=1 # set one for roberta export LR=5e-5 export TRAINING_DATA_INSTANCES=1668 export NUM_EPOCHS=2 diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index bc4069f..12e47f7 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -30,6 +30,7 @@ local boolToInt(s) = "max_sent_per_example": 10, "use_sep": stringToBool(std.extVar("USE_SEP")), "intersentence_token":std.extVar("TOKEN"), + "model_type":std.extVar("MODEL_TYPE"), "sci_sum": stringToBool(std.extVar("SCI_SUM")), "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index 1d33a2e..699c151 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -26,6 +26,7 @@ def __init__(self, vocab: Vocabulary, bert_dropout: float = 0.1, sci_sum: bool = False, intersentence_token: str = "[SEP]", + model_type: str = "bert", additional_feature_size: int = 0, ) -> None: super(SeqClassificationModel, self).__init__(vocab) @@ -101,6 +102,14 @@ def forward(self, # type: ignore sentences_mask = sentences['bert']["token_ids"] == index_sep # mask for all the SEP tokens in the batch embedded_sentences = embedded_sentences[sentences_mask] # given batch_size x num_sentences_per_example x sent_len x vector_len # returns num_sentences_per_batch x vector_len + ## roberta only WORKS ONLY IF BATCH SIZE == 1 + if self.model_type == "roberta" : + assert batch_size == 1, "set batch size to 1 for RoBERTa" + indx = np.arange(embedded_sentences.shape[0]) + device = "cuda" + sel_idx = torch.from_numpy(indx[indx%2==0]).to(device)# select only scond intersentence marker + embedded_sentences = torch.index_select(embedded_sentences, 0, sel_idx) + assert embedded_sentences.dim() == 2 num_sentences = embedded_sentences.shape[0] # for the rest of the code in this model to work, think of the data we have as one example From 33716f2fb22e7e2dc36b26aa617237d4482d3808 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 17:28:33 +0000 Subject: [PATCH 6/8] fixed typos --- scripts/train.sh | 8 ++++---- sequential_sentence_classification/config.jsonnet | 15 +++++++-------- .../dataset_reader.py | 8 ++++---- sequential_sentence_classification/model.py | 2 ++ 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index e41e7bd..0c87b90 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -10,10 +10,9 @@ export NUMPY_SEED=`expr $PYTORCH_SEED / 10` # export MODEL_TYPE=bert export BERT_MODEL=roberta-base -export TOKEN= +export TOKEN="" export MODEL_TYPE=roberta - # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl export DEV_PATH=data/CSAbstruct/dev.jsonl @@ -27,7 +26,8 @@ export WITH_CRF=false # CRF only works for the baseline export cuda_device=0 export BATCH_SIZE=1 # set one for roberta export LR=5e-5 -export TRAINING_DATA_INSTANCES=1668 +#export TRAINING_DATA_INSTANCES=1668 +export TRAINING_STEPS=52 export NUM_EPOCHS=2 # limit number of sentneces per examples, and number of words per sentence. 
This is dataset dependant @@ -41,4 +41,4 @@ export SCI_SUM_FAKE_SCORES=false # use fake scores for testing CONFIG_FILE=sequential_sentence_classification/config.jsonnet -python -m allennlp train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" +python3 -m allennlp train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index 12e47f7..6c9414f 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -29,8 +29,6 @@ local boolToInt(s) = "sent_max_len": std.parseInt(std.extVar("SENT_MAX_LEN")), "max_sent_per_example": 10, "use_sep": stringToBool(std.extVar("USE_SEP")), - "intersentence_token":std.extVar("TOKEN"), - "model_type":std.extVar("MODEL_TYPE"), "sci_sum": stringToBool(std.extVar("SCI_SUM")), "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), @@ -52,8 +50,10 @@ local boolToInt(s) = } } }, - "use_sep": std.extVar("USE_SEP"), - "with_crf": std.extVar("WITH_CRF"), + "use_sep": stringToBool(std.extVar("USE_SEP")), + "with_crf": stringToBool(std.extVar("WITH_CRF")), + "intersentence_token":std.extVar("TOKEN"), + "model_type":std.extVar("MODEL_TYPE"), "bert_dropout": 0.1, "sci_sum": stringToBool(std.extVar("SCI_SUM")), "additional_feature_size": boolToInt(stringToBool(std.extVar("USE_ABSTRACT_SCORES"))), @@ -68,15 +68,14 @@ local boolToInt(s) = "data_loader": { "batch_size": std.parseInt(std.extVar("BATCH_SIZE")), "shuffle": false, - } + }, "trainer": { "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), "grad_clipping": 1.0, "patience": 5, - "model_save_interval": 3600, "validation_metric": if stringToBool(std.extVar("SCI_SUM")) then "-loss" else '+acc', "cuda_device": std.parseInt(std.extVar("cuda_device")), - "gradient_accumulation_batch_size": 32, + "num_gradient_accumulation_steps": 32, "optimizer": { "type": "huggingface_adamw", "lr": std.parseJson(std.extVar("LR")), @@ -85,7 +84,7 @@ local boolToInt(s) = "learning_rate_scheduler": { "type": "slanted_triangular", "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), - "num_steps_per_epoch": std.parseInt(std.extVar("TRAINING_DATA_INSTANCES")) / 32, + "num_steps_per_epoch": std.parseInt(std.extVar("TRAINING_STEPS")), "cut_frac": 0.1, }, } diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index 9351657..7cbf970 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -4,6 +4,7 @@ from overrides import overrides import numpy as np +import copy from allennlp.data.dataset_readers.dataset_reader import DatasetReader from allennlp.common.file_utils import cached_path @@ -13,7 +14,7 @@ from allennlp.data.fields import TextField, LabelField, ListField, ArrayField, MultiLabelField from allennlp.data.token_indexers import SingleIdTokenIndexer from allennlp.data.tokenizers import WhitespaceTokenizer -from allennlp.data.tokenizers.token import Token +from allennlp.data.tokenizers.token_class import Token @DatasetReader.register("SeqClassificationReader") @@ -41,7 +42,7 @@ def __init__(self, predict: bool = False, ) -> None: super().__init__(manual_distributed_sharding=True, - manual_multiprocess_sharding=True, **kwargs) + manual_multiprocess_sharding=True) 
self._tokenizer = tokenizer or WhitespaceTokenizer() self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self.sent_max_len = sent_max_len @@ -245,8 +246,7 @@ def apply_token_indexers(self, instance: Instance) -> None: for text_field in instance["sentences"].field_list: text_field.token_indexers = self._token_indexers - def shorten_sentences(self, origin_sent, max_len): - + def shorten_sentences(self, origin_sent, max_len): tokenized_sentences = [self._tokenizer.sequence_pair_start_tokens] for s in origin_sent: if len(self._tokenizer.tokenize(s)) > (max_len): diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index 699c151..1acb90b 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -1,6 +1,7 @@ import logging from typing import Dict +import numpy as np import torch from torch.nn import Linear from allennlp.data import Vocabulary @@ -39,6 +40,7 @@ def __init__(self, vocab: Vocabulary, self.self_attn = self_attn self.additional_feature_size = additional_feature_size self.token = intersentence_token + self.model_type = model_type self.dropout = torch.nn.Dropout(p=bert_dropout) # define loss From 9cc900bba596cb689a5ef4188d32e9507e10628c Mon Sep 17 00:00:00 2001 From: Urszula Czerwinska Date: Mon, 15 Mar 2021 12:03:42 +0100 Subject: [PATCH 7/8] update train.sh best params bert sep --- scripts/train.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 0c87b90..75dfe3b 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -5,13 +5,13 @@ export PYTORCH_SEED=`expr $SEED / 10` export NUMPY_SEED=`expr $PYTORCH_SEED / 10` # path to bert type and path -# export BERT_MODEL=allenai/scibert_scivocab_uncased -# export TOKEN= [SEP] -# export MODEL_TYPE=bert +export BERT_MODEL=allenai/scibert_scivocab_uncased +export TOKEN=[SEP] +export MODEL_TYPE=bert -export BERT_MODEL=roberta-base -export TOKEN="" -export MODEL_TYPE=roberta +# export BERT_MODEL=roberta-base +# export TOKEN="" +# export MODEL_TYPE=roberta # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl @@ -24,11 +24,11 @@ export WITH_CRF=false # CRF only works for the baseline # training params export cuda_device=0 -export BATCH_SIZE=1 # set one for roberta -export LR=5e-5 +export BATCH_SIZE=4 # set one for roberta +export LR=1e-5 #export TRAINING_DATA_INSTANCES=1668 export TRAINING_STEPS=52 -export NUM_EPOCHS=2 +export NUM_EPOCHS=20 # limit number of sentneces per examples, and number of words per sentence. 
This is dataset dependant export MAX_SENT_PER_EXAMPLE=10 From 96de0eda7bae96e9a72894966eca37f7e8e321c3 Mon Sep 17 00:00:00 2001 From: Urszula Czerwinska Date: Mon, 15 Mar 2021 12:04:29 +0100 Subject: [PATCH 8/8] update config.jsonnet shuffle before train --- sequential_sentence_classification/config.jsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index 6c9414f..987875c 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -67,7 +67,7 @@ local boolToInt(s) = }, "data_loader": { "batch_size": std.parseInt(std.extVar("BATCH_SIZE")), - "shuffle": false, + "shuffle": true, }, "trainer": { "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), @@ -88,4 +88,4 @@ local boolToInt(s) = "cut_frac": 0.1, }, } -} \ No newline at end of file +}
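The patches above converge on the AllenNLP 2 dataset-reader contract: sharding is declared in the constructor, _read wraps its file iterator in shard_iterable, TextFields are built without token indexers, and the indexers are attached afterwards in apply_token_indexers. The sketch below pulls that pattern into one self-contained example for readers following the migration. It is an illustration under stated assumptions, not the project's actual SeqClassificationReader: the registered name, default model name, JSON keys and flat label scheme are placeholders, and reading the separator id from the wrapped Hugging Face tokenizer (via its .tokenizer attribute) is only one possible way to avoid the hard-coded 103 flagged in the TODO comments.

import json
from typing import Dict, Iterable, List

from allennlp.data import DatasetReader, Instance, TokenIndexer
from allennlp.data.fields import Field, LabelField, ListField, TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer


@DatasetReader.register("toy_seq_sent_reader")  # hypothetical name, for illustration only
class ToySeqSentReader(DatasetReader):
    """Minimal sketch of the AllenNLP 2 reader pattern used in the patches above."""

    def __init__(self, model_name: str = "allenai/scibert_scivocab_uncased", **kwargs) -> None:
        # Declare manual sharding so shard_iterable() below decides which lines
        # each distributed worker / data-loader process actually reads.
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(model_name)
        self._token_indexers: Dict[str, TokenIndexer] = {
            "bert": PretrainedTransformerIndexer(model_name)
        }
        # One way to avoid hard-coding 103 for [SEP]: ask the wrapped Hugging Face
        # tokenizer for its separator id (assumes the `.tokenizer` attribute).
        self._sep_id = self._tokenizer.tokenizer.sep_token_id

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as data_file:
            # shard_iterable skips the lines that belong to other workers.
            for line in self.shard_iterable(data_file):
                example = json.loads(line)
                yield self.text_to_instance(example["sentences"], example.get("labels"))

    def text_to_instance(self, sentences: List[str], labels: List[str] = None) -> Instance:
        tokenized = [self._tokenizer.tokenize(text) for text in sentences]
        # TextFields are created *without* indexers; they are attached in
        # apply_token_indexers, which keeps instances cheap to serialize.
        fields: Dict[str, Field] = {
            "sentences": ListField([TextField(tokens) for tokens in tokenized])
        }
        if labels is not None:
            fields["labels"] = ListField([LabelField(label) for label in labels])
        return Instance(fields)

    def apply_token_indexers(self, instance: Instance) -> None:
        for text_field in instance["sentences"].field_list:
            text_field.token_indexers = self._token_indexers

With a reader registered this way, the "dataset_reader" block of a config refers to it by its registered name, which is how the patched config.jsonnet points at SeqClassificationReader.

The model-side edits follow the same AllenNLP 2 API shift: the embedder dimension comes from get_output_dim(), the embedder call passes num_wrapping_dims=1 for the ListField of sentences, and F1Measure.get_metric is read through dictionary keys instead of tuple indices. Below is a minimal sketch of the metric change, using a made-up binary example rather than the project's label set.

import torch
from allennlp.training.metrics import F1Measure

# In AllenNLP 2.x, get_metric() returns a dict such as
# {"precision": ..., "recall": ..., "f1": ...} instead of a (P, R, F1) tuple.
f1 = F1Measure(positive_label=1)

logits = torch.tensor([[0.2, 0.8], [0.7, 0.3], [0.1, 0.9]])  # (batch, num_classes)
gold = torch.tensor([1, 0, 0])                               # gold class indices

f1(logits, gold)
scores = f1.get_metric(reset=True)
print(scores["precision"], scores["recall"], scores["f1"])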