From 348f6f51510c7b1dc864ceda797435e288f23f91 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:19:01 +0000 Subject: [PATCH 1/8] changing config and classes construction for allennlp2 --- scripts/train.sh | 5 +- .../config.jsonnet | 84 ++++++++----------- .../dataset_reader.py | 27 +++--- sequential_sentence_classification/model.py | 10 +-- 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 6ed1c61..74c3a62 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -5,8 +5,7 @@ export PYTORCH_SEED=`expr $SEED / 10` export NUMPY_SEED=`expr $PYTORCH_SEED / 10` # path to bert vocab and weights -export BERT_VOCAB=https://ai2-s2-research.s3-us-west-2.amazonaws.com/scibert/allennlp_files/scivocab_uncased.vocab -export BERT_WEIGHTS=https://ai2-s2-research.s3-us-west-2.amazonaws.com/scibert/allennlp_files/scibert_scivocab_uncased.tar.gz +export BERT_MODEL=allenai/scibert_scivocab_uncased # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl @@ -35,4 +34,4 @@ export SCI_SUM_FAKE_SCORES=false # use fake scores for testing CONFIG_FILE=sequential_sentence_classification/config.jsonnet -python -m allennlp.run train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" +python -m allennlp train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index d42b0c8..1635e2c 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -12,45 +12,42 @@ local boolToInt(s) = "random_seed": std.parseInt(std.extVar("SEED")), "pytorch_seed": std.parseInt(std.extVar("PYTORCH_SEED")), "numpy_seed": std.parseInt(std.extVar("NUMPY_SEED")), - "dataset_reader":{ - "type":"SeqClassificationReader", - "lazy": false, - "sent_max_len": std.extVar("SENT_MAX_LEN"), - "word_splitter": "bert-basic", - "max_sent_per_example": std.extVar("MAX_SENT_PER_EXAMPLE"), - "token_indexers": { - "bert": { - "type": "bert-pretrained", - "pretrained_model": std.extVar("BERT_VOCAB"), - "do_lowercase": true, - "use_starting_offsets": false - }, + "dataset_reader" : { + "type": "SeqClassificationReader", + "tokenizer": { + "type": "pretrained_transformer", + "model_name": std.extVar("BERT_MODEL"), + "tokenizer_kwargs": {"truncation_strategy" : 'do_not_truncate'}, + }, + "token_indexers": { + "bert": { + "type": "pretrained_transformer", + "model_name": std.extVar("BERT_MODEL"), + "tokenizer_kwargs": {"truncation_strategy" : 'do_not_truncate'}, + } + }, + "sent_max_len": std.parseInt(std.extVar("SENT_MAX_LEN")), + "max_sent_per_example": 10, + "use_sep": stringToBool(std.extVar("USE_SEP")), + "sci_sum": stringToBool(std.extVar("SCI_SUM")), + "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), + "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), }, - "use_sep": std.extVar("USE_SEP"), - "sci_sum": stringToBool(std.extVar("SCI_SUM")), - "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), - "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), - }, - "train_data_path": std.extVar("TRAIN_PATH"), "validation_data_path": std.extVar("DEV_PATH"), "test_data_path": std.extVar("TEST_PATH"), "evaluate_on_test": true, "model": { "type": "SeqClassificationModel", - "text_field_embedder": { - "allow_unmatched_keys": true, - 
"embedder_to_indexer_map": { - "bert": if stringToBool(std.extVar("USE_SEP")) then ["bert"] else ["bert", "bert-offsets"], - "tokens": ["tokens"], - }, + "text_field_embedder": { "token_embedders": { "bert": { - "type": "bert-pretrained", - "pretrained_model": std.extVar("BERT_WEIGHTS"), - "requires_grad": 'all', - "top_layer_only": false, - } + "type": "pretrained_transformer", + "model_name": std.extVar("BERT_MODEL"), + "train_parameters": 1, + "last_layer_only": 1, + + } } }, "use_sep": std.extVar("USE_SEP"), @@ -59,43 +56,30 @@ local boolToInt(s) = "sci_sum": stringToBool(std.extVar("SCI_SUM")), "additional_feature_size": boolToInt(stringToBool(std.extVar("USE_ABSTRACT_SCORES"))), "self_attn": { - "type": "stacked_self_attention", + "type": "pytorch_transformer", "input_dim": 768, - "projection_dim": 100, "feedforward_hidden_dim": 50, "num_layers": 2, "num_attention_heads": 2, - "hidden_dim": 100, }, }, - "iterator": { - "type": "bucket", - "sorting_keys": [["sentences", "num_fields"]], - "batch_size" : std.parseInt(std.extVar("BATCH_SIZE")), - "cache_instances": true, - "biggest_batch_first": true - }, - + "data_loader": { + "batch_size": std.parseInt(std.extVar("BATCH_SIZE")), + "shuffle": false, + } "trainer": { "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), "grad_clipping": 1.0, "patience": 5, "model_save_interval": 3600, "validation_metric": if stringToBool(std.extVar("SCI_SUM")) then "-loss" else '+acc', - "min_delta": 0.001, "cuda_device": std.parseInt(std.extVar("cuda_device")), "gradient_accumulation_batch_size": 32, "optimizer": { - "type": "bert_adam", - "lr": std.extVar("LR"), - "t_total": -1, - "max_grad_norm": 1.0, + "type": "huggingface_adamw", + "lr": std.parseJson(std.extVar("LR")), "weight_decay": 0.01, - "parameter_groups": [ - [["bias", "LayerNorm.bias", "LayerNorm.weight", "layer_norm.weight"], {"weight_decay": 0.0}], - ], }, - "should_log_learning_rate": true, "learning_rate_scheduler": { "type": "slanted_triangular", "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index 789ecf2..f6772c5 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -7,14 +7,13 @@ from allennlp.data.dataset_readers.dataset_reader import DatasetReader from allennlp.common.file_utils import cached_path -from allennlp.data import Tokenizer +from allennlp.data import TokenIndexer, Tokenizer from allennlp.data.instance import Instance from allennlp.data.fields.field import Field from allennlp.data.fields import TextField, LabelField, ListField, ArrayField, MultiLabelField -from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer -from allennlp.data.tokenizers import WordTokenizer +from allennlp.data.token_indexers import SingleIdTokenIndexer +from allennlp.data.tokenizers import WhitespaceTokenizer from allennlp.data.tokenizers.token import Token -from allennlp.data.tokenizers.word_splitter import SimpleWordSplitter, WordSplitter, SpacyWordSplitter @DatasetReader.register("SeqClassificationReader") @@ -31,9 +30,7 @@ class SeqClassificationReader(DatasetReader): """ def __init__(self, - lazy: bool = False, token_indexers: Dict[str, TokenIndexer] = None, - word_splitter: WordSplitter = None, tokenizer: Tokenizer = None, sent_max_len: int = 100, max_sent_per_example: int = 20, @@ -43,8 +40,9 @@ def __init__(self, sci_sum_fake_scores: bool = True, predict: bool = False, ) -> 
None: - super().__init__(lazy) - self._tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter(pos_tags=False)) + super().__init__(manual_distributed_sharding=True, + manual_multiprocess_sharding=True, **kwargs) + self._tokenizer = tokenizer or WhitespaceTokenizer() self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self.sent_max_len = sent_max_len self.use_sep = use_sep @@ -54,12 +52,12 @@ def __init__(self, self.use_abstract_scores = use_abstract_scores self.sci_sum_fake_scores = sci_sum_fake_scores - @overrides + def _read(self, file_path: str): file_path = cached_path(file_path) with open(file_path) as f: - for line in f: + for line in self.shard_iterable(f): json_dict = json.loads(line) instances = self.read_one_example(json_dict) for instance in instances: @@ -173,7 +171,6 @@ def filter_bad_sci_sum_sentences(self, sentences, labels): return sentences, labels - @overrides def text_to_instance(self, sentences: List[str], labels: List[str] = None, @@ -199,7 +196,7 @@ def text_to_instance(self, fields: Dict[str, Field] = {} fields["sentences"] = ListField([ - TextField(sentence, self._token_indexers) + TextField(sentence) for sentence in sentences ]) @@ -223,4 +220,8 @@ def text_to_instance(self, if additional_features is not None: fields["additional_features"] = ArrayField(np.array(additional_features)) - return Instance(fields) \ No newline at end of file + return Instance(fields) + + def apply_token_indexers(self, instance: Instance) -> None: + for text_field in instance["sentences"].field_list: + text_field.token_indexers = self._token_indexers \ No newline at end of file diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index 6e7e9f8..cb9ff7e 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -57,7 +57,7 @@ def __init__(self, vocab: Vocabulary, label_name = self.vocab.get_token_from_index(namespace='labels', index=label_index) self.label_f1_metrics[label_name] = F1Measure(label_index) - encoded_senetence_dim = text_field_embedder._token_embedders['bert'].output_dim + encoded_senetence_dim = text_field_embedder._token_embedders['bert'].get_output_dim() ff_in_dim = encoded_senetence_dim if self.use_sep else self_attn.get_output_dim() ff_in_dim += self.additional_feature_size @@ -94,7 +94,7 @@ def forward(self, # type: ignore # Output: embedded_sentences # embedded_sentences: batch_size, num_sentences, sentence_length, embedding_size - embedded_sentences = self.text_field_embedder(sentences) + embedded_sentences = self.text_field_embedder(sentences, num_wrapping_dims= 1) mask = get_text_field_mask(sentences, num_wrapping_dims=1).float() batch_size, num_sentences, _, _ = embedded_sentences.size() @@ -102,7 +102,7 @@ def forward(self, # type: ignore # The following code collects vectors of the SEP tokens from all the examples in the batch, # and arrange them in one list. It does the same for the labels and confidences. 
# TODO: replace 103 with '[SEP]' - sentences_mask = sentences['bert'] == 103 # mask for all the SEP tokens in the batch + sentences_mask = sentences['bert']["token_ids"] == 103 # mask for all the SEP tokens in the batch embedded_sentences = embedded_sentences[sentences_mask] # given batch_size x num_sentences_per_example x sent_len x vector_len # returns num_sentences_per_batch x vector_len assert embedded_sentences.dim() == 2 @@ -238,8 +238,8 @@ def get_metrics(self, reset: bool = False): average_F1 = 0.0 for name, metric in self.label_f1_metrics.items(): metric_val = metric.get_metric(reset) - metric_dict[name + 'F'] = metric_val[2] - average_F1 += metric_val[2] + metric_dict[name + 'F'] = metric_val["f1"] + average_F1 += metric_val["f1"] average_F1 /= len(self.label_f1_metrics.items()) metric_dict['avgF'] = average_F1 From 1186e3afa44de6bee70ebc52a0d0a37c699edeba Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:28:42 +0000 Subject: [PATCH 2/8] changed sep to a param; dealing with special tokesn in data reader --- scripts/train.sh | 1 + .../config.jsonnet | 1 + .../dataset_reader.py | 31 ++++++++++++++----- sequential_sentence_classification/model.py | 14 +++------ 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 74c3a62..6fd55af 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -14,6 +14,7 @@ export TEST_PATH=data/CSAbstruct/test.jsonl # model export USE_SEP=true # true for our model. false for baseline +export TOKEN=[SEP] export WITH_CRF=false # CRF only works for the baseline # training params diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index 1635e2c..bc4069f 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -29,6 +29,7 @@ local boolToInt(s) = "sent_max_len": std.parseInt(std.extVar("SENT_MAX_LEN")), "max_sent_per_example": 10, "use_sep": stringToBool(std.extVar("USE_SEP")), + "intersentence_token":std.extVar("TOKEN"), "sci_sum": stringToBool(std.extVar("SCI_SUM")), "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index f6772c5..f93dc46 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -51,6 +51,12 @@ def __init__(self, self.max_sent_per_example = max_sent_per_example self.use_abstract_scores = use_abstract_scores self.sci_sum_fake_scores = sci_sum_fake_scores + print("*********************************") + print("start token : ", self._tokenizer.sequence_pair_start_tokens) + print("middle token : ", self._tokenizer.sequence_pair_mid_tokens) + print("end token : ", self._tokenizer.sequence_pair_end_tokens) + print("*********************************") + def _read(self, file_path: str): @@ -185,15 +191,24 @@ def text_to_instance(self, assert len(sentences) == len(additional_features) if self.use_sep: - tokenized_sentences = [self._tokenizer.tokenize(s)[:self.sent_max_len] + [Token("[SEP]")] for s in sentences] - sentences = [list(itertools.chain.from_iterable(tokenized_sentences))[:-1]] - else: # Tokenize the sentences - sentences = [ - self._tokenizer.tokenize(sentence_text)[:self.sent_max_len] - for sentence_text in sentences - ] - + tokenized_sentences =[] + if 
len(self._tokenizer.tokenize(s)) > self.sent_max_len: + tokenized_sentences.append(self._tokenizer.tokenize(s)[:self.sent_max_len]+self._tokenizer.sequence_pair_mid_tokens) + else: + tokenized_sentences.append(self._tokenizer.tokenize(s)) + sentences = [list(itertools.chain.from_iterable(tokenized_sentences))] + + else: + tok_sentences = [] + for sentence_text in sentences: + if len(self._tokenizer.tokenize(sentence_text)) > self.sent_max_len: + tok_sentences.append(self._tokenizer.tokenize(sentence_text)[:self.sent_max_len]+self._tokenizer.sequence_pair_end_tokens) + else: + tok_sentences.append(self._tokenizer.tokenize(sentence_text)) + + sentences = tok_sentences + fields: Dict[str, Field] = {} fields["sentences"] = ListField([ TextField(sentence) diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index cb9ff7e..f6e63ae 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -25,6 +25,7 @@ def __init__(self, vocab: Vocabulary, self_attn: Seq2SeqEncoder = None, bert_dropout: float = 0.1, sci_sum: bool = False, + intersentence_token: str = "[SEP]", additional_feature_size: int = 0, ) -> None: super(SeqClassificationModel, self).__init__(vocab) @@ -36,7 +37,7 @@ def __init__(self, vocab: Vocabulary, self.sci_sum = sci_sum self.self_attn = self_attn self.additional_feature_size = additional_feature_size - + self.token = intersentence_token self.dropout = torch.nn.Dropout(p=bert_dropout) # define loss @@ -62,7 +63,7 @@ def __init__(self, vocab: Vocabulary, ff_in_dim = encoded_senetence_dim if self.use_sep else self_attn.get_output_dim() ff_in_dim += self.additional_feature_size - self.time_distributed_aggregate_feedforward = TimeDistributed(Linear(ff_in_dim, self.num_labels)) + self.time_distributed_aggregate_feedforward = Linear(ff_in_dim, self.num_labels) if self.with_crf: self.crf = ConditionalRandomField( @@ -82,14 +83,8 @@ def forward(self, # type: ignore ---------- TODO: add description - Returns - ------- - An output dictionary consisting of: - loss : torch.FloatTensor, optional - A scalar loss to be optimised. """ # =========================================================================================================== - # Layer 1: For each sentence, participant pair: create a Glove embedding for each token # Input: sentences # Output: embedded_sentences @@ -102,7 +97,8 @@ def forward(self, # type: ignore # The following code collects vectors of the SEP tokens from all the examples in the batch, # and arrange them in one list. It does the same for the labels and confidences. 
# TODO: replace 103 with '[SEP]' - sentences_mask = sentences['bert']["token_ids"] == 103 # mask for all the SEP tokens in the batch + index_sep = int(self.vocab.get_token_index(token=self.token, namespace = "tags")) + sentences_mask = sentences['bert']["token_ids"] == index_sep # mask for all the SEP tokens in the batch embedded_sentences = embedded_sentences[sentences_mask] # given batch_size x num_sentences_per_example x sent_len x vector_len # returns num_sentences_per_batch x vector_len assert embedded_sentences.dim() == 2 From 81c249dd69616e3ed02fb127ad38d588636c3a8e Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:33:18 +0000 Subject: [PATCH 3/8] solved pb of cutting of sentences and labels number not matching --- .../dataset_reader.py | 31 ++++++++++++++----- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index f93dc46..9351657 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -191,13 +191,17 @@ def text_to_instance(self, assert len(sentences) == len(additional_features) if self.use_sep: - # Tokenize the sentences - tokenized_sentences =[] - if len(self._tokenizer.tokenize(s)) > self.sent_max_len: - tokenized_sentences.append(self._tokenizer.tokenize(s)[:self.sent_max_len]+self._tokenizer.sequence_pair_mid_tokens) - else: - tokenized_sentences.append(self._tokenizer.tokenize(s)) - sentences = [list(itertools.chain.from_iterable(tokenized_sentences))] + origin_sent = copy.deepcopy(sentences) + sentences = self.shorten_sentences(sentences, self.sent_max_len) + + max_len=self.sent_max_len + while len(sentences[0]) > 512: + n = int((len(sentences[0])-512)/ len(origin_sent))+1 + + max_len -= n + sentences = self.shorten_sentences(origin_sent, max_len ) + + assert len(sentences[0]) <= 512 else: tok_sentences = [] @@ -239,4 +243,15 @@ def text_to_instance(self, def apply_token_indexers(self, instance: Instance) -> None: for text_field in instance["sentences"].field_list: - text_field.token_indexers = self._token_indexers \ No newline at end of file + text_field.token_indexers = self._token_indexers + + def shorten_sentences(self, origin_sent, max_len): + + tokenized_sentences = [self._tokenizer.sequence_pair_start_tokens] + for s in origin_sent: + if len(self._tokenizer.tokenize(s)) > (max_len): + tokenized_sentences.append(self._tokenizer.tokenize(s)[1:(max_len)]+self._tokenizer.sequence_pair_mid_tokens) + else: + tokenized_sentences.append(self._tokenizer.tokenize(s)[1:-1]+self._tokenizer.sequence_pair_mid_tokens) + mid_tok_len = len(self._tokenizer.sequence_pair_mid_tokens) + return [list(itertools.chain.from_iterable(tokenized_sentences))[:-mid_tok_len]+self._tokenizer.sequence_pair_end_tokens] \ No newline at end of file From 588ac809a22e84e8a1e4d2193900a4117ee7be46 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:36:48 +0000 Subject: [PATCH 4/8] dealing with single sentence in sequence of sentences --- sequential_sentence_classification/model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index f6e63ae..1d33a2e 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -193,7 +193,10 @@ def forward(self, # type: ignore flattened_gold = labels.contiguous().view(-1) 
if not self.with_crf: - label_loss = self.loss(flattened_logits.squeeze(), flattened_gold) + if flattened_logits.shape[0] == 1: + label_loss = self.loss(flattened_logits, flattened_gold) + else: + label_loss = self.loss(flattened_logits.squeeze(), flattened_gold) if confidences is not None: label_loss = label_loss * confidences.type_as(label_loss).view(-1) label_loss = label_loss.mean() @@ -211,7 +214,10 @@ def forward(self, # type: ignore if not self.labels_are_scores: evaluation_mask = (flattened_gold != -1) - self.label_accuracy(flattened_probs.float().contiguous(), flattened_gold.squeeze(-1), mask=evaluation_mask) + if flattened_probs.shape[0] == 1: + self.label_accuracy(flattened_probs.float().contiguous(), flattened_gold, mask=evaluation_mask) + else: + self.label_accuracy(flattened_probs.float().contiguous(), flattened_gold.squeeze(-1), mask=evaluation_mask) # compute F1 per label for label_index in range(self.num_labels): From 850bff6f03d0d4f6c3ab9a3697aa62ea557bd078 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 16:49:19 +0000 Subject: [PATCH 5/8] added roberta params and model --- scripts/train.sh | 14 ++++++++++---- sequential_sentence_classification/config.jsonnet | 1 + sequential_sentence_classification/model.py | 9 +++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 6fd55af..e41e7bd 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -4,8 +4,15 @@ export SEED=15270 export PYTORCH_SEED=`expr $SEED / 10` export NUMPY_SEED=`expr $PYTORCH_SEED / 10` -# path to bert vocab and weights -export BERT_MODEL=allenai/scibert_scivocab_uncased +# path to bert type and path +# export BERT_MODEL=allenai/scibert_scivocab_uncased +# export TOKEN= [SEP] +# export MODEL_TYPE=bert + +export BERT_MODEL=roberta-base +export TOKEN= +export MODEL_TYPE=roberta + # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl @@ -14,12 +21,11 @@ export TEST_PATH=data/CSAbstruct/test.jsonl # model export USE_SEP=true # true for our model. 
false for baseline -export TOKEN=[SEP] export WITH_CRF=false # CRF only works for the baseline # training params export cuda_device=0 -export BATCH_SIZE=4 +export BATCH_SIZE=1 # set one for roberta export LR=5e-5 export TRAINING_DATA_INSTANCES=1668 export NUM_EPOCHS=2 diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index bc4069f..12e47f7 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -30,6 +30,7 @@ local boolToInt(s) = "max_sent_per_example": 10, "use_sep": stringToBool(std.extVar("USE_SEP")), "intersentence_token":std.extVar("TOKEN"), + "model_type":std.extVar("MODEL_TYPE"), "sci_sum": stringToBool(std.extVar("SCI_SUM")), "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index 1d33a2e..699c151 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -26,6 +26,7 @@ def __init__(self, vocab: Vocabulary, bert_dropout: float = 0.1, sci_sum: bool = False, intersentence_token: str = "[SEP]", + model_type: str = "bert", additional_feature_size: int = 0, ) -> None: super(SeqClassificationModel, self).__init__(vocab) @@ -101,6 +102,14 @@ def forward(self, # type: ignore sentences_mask = sentences['bert']["token_ids"] == index_sep # mask for all the SEP tokens in the batch embedded_sentences = embedded_sentences[sentences_mask] # given batch_size x num_sentences_per_example x sent_len x vector_len # returns num_sentences_per_batch x vector_len + ## roberta only WORKS ONLY IF BATCH SIZE == 1 + if self.model_type == "roberta" : + assert batch_size == 1, "set batch size to 1 for RoBERTa" + indx = np.arange(embedded_sentences.shape[0]) + device = "cuda" + sel_idx = torch.from_numpy(indx[indx%2==0]).to(device)# select only scond intersentence marker + embedded_sentences = torch.index_select(embedded_sentences, 0, sel_idx) + assert embedded_sentences.dim() == 2 num_sentences = embedded_sentences.shape[0] # for the rest of the code in this model to work, think of the data we have as one example From 33716f2fb22e7e2dc36b26aa617237d4482d3808 Mon Sep 17 00:00:00 2001 From: UrszulaCzerwinska Date: Fri, 12 Mar 2021 17:28:33 +0000 Subject: [PATCH 6/8] fixed typos --- scripts/train.sh | 8 ++++---- sequential_sentence_classification/config.jsonnet | 15 +++++++-------- .../dataset_reader.py | 8 ++++---- sequential_sentence_classification/model.py | 2 ++ 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index e41e7bd..0c87b90 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -10,10 +10,9 @@ export NUMPY_SEED=`expr $PYTORCH_SEED / 10` # export MODEL_TYPE=bert export BERT_MODEL=roberta-base -export TOKEN= +export TOKEN="" export MODEL_TYPE=roberta - # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl export DEV_PATH=data/CSAbstruct/dev.jsonl @@ -27,7 +26,8 @@ export WITH_CRF=false # CRF only works for the baseline export cuda_device=0 export BATCH_SIZE=1 # set one for roberta export LR=5e-5 -export TRAINING_DATA_INSTANCES=1668 +#export TRAINING_DATA_INSTANCES=1668 +export TRAINING_STEPS=52 export NUM_EPOCHS=2 # limit number of sentneces per examples, and number of words per sentence. 
This is dataset dependant @@ -41,4 +41,4 @@ export SCI_SUM_FAKE_SCORES=false # use fake scores for testing CONFIG_FILE=sequential_sentence_classification/config.jsonnet -python -m allennlp train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" +python3 -m allennlp train $CONFIG_FILE --include-package sequential_sentence_classification -s $SERIALIZATION_DIR "$@" diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index 12e47f7..6c9414f 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -29,8 +29,6 @@ local boolToInt(s) = "sent_max_len": std.parseInt(std.extVar("SENT_MAX_LEN")), "max_sent_per_example": 10, "use_sep": stringToBool(std.extVar("USE_SEP")), - "intersentence_token":std.extVar("TOKEN"), - "model_type":std.extVar("MODEL_TYPE"), "sci_sum": stringToBool(std.extVar("SCI_SUM")), "use_abstract_scores": stringToBool(std.extVar("USE_ABSTRACT_SCORES")), "sci_sum_fake_scores": stringToBool(std.extVar("SCI_SUM_FAKE_SCORES")), @@ -52,8 +50,10 @@ local boolToInt(s) = } } }, - "use_sep": std.extVar("USE_SEP"), - "with_crf": std.extVar("WITH_CRF"), + "use_sep": stringToBool(std.extVar("USE_SEP")), + "with_crf": stringToBool(std.extVar("WITH_CRF")), + "intersentence_token":std.extVar("TOKEN"), + "model_type":std.extVar("MODEL_TYPE"), "bert_dropout": 0.1, "sci_sum": stringToBool(std.extVar("SCI_SUM")), "additional_feature_size": boolToInt(stringToBool(std.extVar("USE_ABSTRACT_SCORES"))), @@ -68,15 +68,14 @@ local boolToInt(s) = "data_loader": { "batch_size": std.parseInt(std.extVar("BATCH_SIZE")), "shuffle": false, - } + }, "trainer": { "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), "grad_clipping": 1.0, "patience": 5, - "model_save_interval": 3600, "validation_metric": if stringToBool(std.extVar("SCI_SUM")) then "-loss" else '+acc', "cuda_device": std.parseInt(std.extVar("cuda_device")), - "gradient_accumulation_batch_size": 32, + "num_gradient_accumulation_steps": 32, "optimizer": { "type": "huggingface_adamw", "lr": std.parseJson(std.extVar("LR")), @@ -85,7 +84,7 @@ local boolToInt(s) = "learning_rate_scheduler": { "type": "slanted_triangular", "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), - "num_steps_per_epoch": std.parseInt(std.extVar("TRAINING_DATA_INSTANCES")) / 32, + "num_steps_per_epoch": std.parseInt(std.extVar("TRAINING_STEPS")), "cut_frac": 0.1, }, } diff --git a/sequential_sentence_classification/dataset_reader.py b/sequential_sentence_classification/dataset_reader.py index 9351657..7cbf970 100644 --- a/sequential_sentence_classification/dataset_reader.py +++ b/sequential_sentence_classification/dataset_reader.py @@ -4,6 +4,7 @@ from overrides import overrides import numpy as np +import copy from allennlp.data.dataset_readers.dataset_reader import DatasetReader from allennlp.common.file_utils import cached_path @@ -13,7 +14,7 @@ from allennlp.data.fields import TextField, LabelField, ListField, ArrayField, MultiLabelField from allennlp.data.token_indexers import SingleIdTokenIndexer from allennlp.data.tokenizers import WhitespaceTokenizer -from allennlp.data.tokenizers.token import Token +from allennlp.data.tokenizers.token_class import Token @DatasetReader.register("SeqClassificationReader") @@ -41,7 +42,7 @@ def __init__(self, predict: bool = False, ) -> None: super().__init__(manual_distributed_sharding=True, - manual_multiprocess_sharding=True, **kwargs) + manual_multiprocess_sharding=True) 
self._tokenizer = tokenizer or WhitespaceTokenizer() self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} self.sent_max_len = sent_max_len @@ -245,8 +246,7 @@ def apply_token_indexers(self, instance: Instance) -> None: for text_field in instance["sentences"].field_list: text_field.token_indexers = self._token_indexers - def shorten_sentences(self, origin_sent, max_len): - + def shorten_sentences(self, origin_sent, max_len): tokenized_sentences = [self._tokenizer.sequence_pair_start_tokens] for s in origin_sent: if len(self._tokenizer.tokenize(s)) > (max_len): diff --git a/sequential_sentence_classification/model.py b/sequential_sentence_classification/model.py index 699c151..1acb90b 100644 --- a/sequential_sentence_classification/model.py +++ b/sequential_sentence_classification/model.py @@ -1,6 +1,7 @@ import logging from typing import Dict +import numpy as np import torch from torch.nn import Linear from allennlp.data import Vocabulary @@ -39,6 +40,7 @@ def __init__(self, vocab: Vocabulary, self.self_attn = self_attn self.additional_feature_size = additional_feature_size self.token = intersentence_token + self.model_type = model_type self.dropout = torch.nn.Dropout(p=bert_dropout) # define loss From 9cc900bba596cb689a5ef4188d32e9507e10628c Mon Sep 17 00:00:00 2001 From: Urszula Czerwinska Date: Mon, 15 Mar 2021 12:03:42 +0100 Subject: [PATCH 7/8] update train.sh best params bert sep --- scripts/train.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scripts/train.sh b/scripts/train.sh index 0c87b90..75dfe3b 100755 --- a/scripts/train.sh +++ b/scripts/train.sh @@ -5,13 +5,13 @@ export PYTORCH_SEED=`expr $SEED / 10` export NUMPY_SEED=`expr $PYTORCH_SEED / 10` # path to bert type and path -# export BERT_MODEL=allenai/scibert_scivocab_uncased -# export TOKEN= [SEP] -# export MODEL_TYPE=bert +export BERT_MODEL=allenai/scibert_scivocab_uncased +export TOKEN=[SEP] +export MODEL_TYPE=bert -export BERT_MODEL=roberta-base -export TOKEN="" -export MODEL_TYPE=roberta +# export BERT_MODEL=roberta-base +# export TOKEN="" +# export MODEL_TYPE=roberta # path to dataset files export TRAIN_PATH=data/CSAbstruct/train.jsonl @@ -24,11 +24,11 @@ export WITH_CRF=false # CRF only works for the baseline # training params export cuda_device=0 -export BATCH_SIZE=1 # set one for roberta -export LR=5e-5 +export BATCH_SIZE=4 # set one for roberta +export LR=1e-5 #export TRAINING_DATA_INSTANCES=1668 export TRAINING_STEPS=52 -export NUM_EPOCHS=2 +export NUM_EPOCHS=20 # limit number of sentneces per examples, and number of words per sentence. 
This is dataset dependant export MAX_SENT_PER_EXAMPLE=10 From 96de0eda7bae96e9a72894966eca37f7e8e321c3 Mon Sep 17 00:00:00 2001 From: Urszula Czerwinska Date: Mon, 15 Mar 2021 12:04:29 +0100 Subject: [PATCH 8/8] update config.jsonnet shuffle before train --- sequential_sentence_classification/config.jsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sequential_sentence_classification/config.jsonnet b/sequential_sentence_classification/config.jsonnet index 6c9414f..987875c 100644 --- a/sequential_sentence_classification/config.jsonnet +++ b/sequential_sentence_classification/config.jsonnet @@ -67,7 +67,7 @@ local boolToInt(s) = }, "data_loader": { "batch_size": std.parseInt(std.extVar("BATCH_SIZE")), - "shuffle": false, + "shuffle": true, }, "trainer": { "num_epochs": std.parseInt(std.extVar("NUM_EPOCHS")), @@ -88,4 +88,4 @@ local boolToInt(s) = "cut_frac": 0.1, }, } -} \ No newline at end of file +}
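The patches above converge on the AllenNLP 2 dataset-reader contract: sharding is declared in the constructor, _read wraps its file iterator in shard_iterable, TextFields are built without token indexers, and the indexers are attached afterwards in apply_token_indexers. The sketch below pulls that pattern into one self-contained example for readers following the migration. It is an illustration under stated assumptions, not the project's actual SeqClassificationReader: the registered name, default model name, JSON keys and flat label scheme are placeholders, and reading the separator id from the wrapped Hugging Face tokenizer (via its .tokenizer attribute) is only one possible way to avoid the hard-coded 103 flagged in the TODO comments.

import json
from typing import Dict, Iterable, List

from allennlp.data import DatasetReader, Instance, TokenIndexer
from allennlp.data.fields import Field, LabelField, ListField, TextField
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer


@DatasetReader.register("toy_seq_sent_reader")  # hypothetical name, for illustration only
class ToySeqSentReader(DatasetReader):
    """Minimal sketch of the AllenNLP 2 reader pattern used in the patches above."""

    def __init__(self, model_name: str = "allenai/scibert_scivocab_uncased", **kwargs) -> None:
        # Declare manual sharding so shard_iterable() below decides which lines
        # each distributed worker / data-loader process actually reads.
        super().__init__(manual_distributed_sharding=True,
                         manual_multiprocess_sharding=True,
                         **kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(model_name)
        self._token_indexers: Dict[str, TokenIndexer] = {
            "bert": PretrainedTransformerIndexer(model_name)
        }
        # One way to avoid hard-coding 103 for [SEP]: ask the wrapped Hugging Face
        # tokenizer for its separator id (assumes the `.tokenizer` attribute).
        self._sep_id = self._tokenizer.tokenizer.sep_token_id

    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path) as data_file:
            # shard_iterable skips the lines that belong to other workers.
            for line in self.shard_iterable(data_file):
                example = json.loads(line)
                yield self.text_to_instance(example["sentences"], example.get("labels"))

    def text_to_instance(self, sentences: List[str], labels: List[str] = None) -> Instance:
        tokenized = [self._tokenizer.tokenize(text) for text in sentences]
        # TextFields are created *without* indexers; they are attached in
        # apply_token_indexers, which keeps instances cheap to serialize.
        fields: Dict[str, Field] = {
            "sentences": ListField([TextField(tokens) for tokens in tokenized])
        }
        if labels is not None:
            fields["labels"] = ListField([LabelField(label) for label in labels])
        return Instance(fields)

    def apply_token_indexers(self, instance: Instance) -> None:
        for text_field in instance["sentences"].field_list:
            text_field.token_indexers = self._token_indexers

With a reader registered this way, the "dataset_reader" block of a config refers to it by its registered name, which is how the patched config.jsonnet points at SeqClassificationReader.

The model-side edits follow the same AllenNLP 2 API shift: the embedder dimension comes from get_output_dim(), the embedder call passes num_wrapping_dims=1 for the ListField of sentences, and F1Measure.get_metric is read through dictionary keys instead of tuple indices. Below is a minimal sketch of the metric change, using a made-up binary example rather than the project's label set.

import torch
from allennlp.training.metrics import F1Measure

# In AllenNLP 2.x, get_metric() returns a dict such as
# {"precision": ..., "recall": ..., "f1": ...} instead of a (P, R, F1) tuple.
f1 = F1Measure(positive_label=1)

logits = torch.tensor([[0.2, 0.8], [0.7, 0.3], [0.1, 0.9]])  # (batch, num_classes)
gold = torch.tensor([1, 0, 0])                               # gold class indices

f1(logits, gold)
scores = f1.get_metric(reset=True)
print(scores["precision"], scores["recall"], scores["f1"])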