From bdeb09b3a1e4d48952859852b75f3def17dddf9a Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Sat, 9 Jan 2021 14:51:47 +0900 Subject: [PATCH 01/48] run and sometimes fail by myself --- download_models.sh | 12 ++++++++++++ lama/modules/base_connector.py | 4 ++-- scripts/run_experiments.py | 34 +++++++++++++++++++++++++++------- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/download_models.sh b/download_models.sh index 886092b..6a726d8 100755 --- a/download_models.sh +++ b/download_models.sh @@ -131,6 +131,18 @@ if [[ ! -f bert/cased_L-24_H-1024_A-16/bert_config.json ]]; then cd ../../ fi +echo "RoBERTa" +if [[ ! -f roberta/roberta-base/config.json ]]; then + rm -rf 'roberta/roberta-base' + mkdir -p 'roberta/roberta-base' + cd 'roberta/roberta-base' + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json' -O vocab.json + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt' -O merges.txt + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin' -O 'pytorch_model.bin' + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json' -O 'config.json' + cd ../.. +fi + cd "$ROOD_DIR" echo 'Building common vocab' diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index e32bf40..49041bf 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -92,8 +92,8 @@ def try_cuda(self): print('Moving model to CUDA') self._cuda() self._model_device = 'cuda' - else: - print('No CUDA found') + # else: + # print('No CUDA found') def _cuda(self): """Move model to GPU.""" diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 024e681..3ace2cb 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -17,13 +17,13 @@ from collections import defaultdict LMs = [ - { - "lm": "transformerxl", - "label": "transformerxl", - "models_names": ["transformerxl"], - "transformerxl_model_name": "transfo-xl-wt103", - "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", - }, + #{ + # "lm": "transformerxl", + # "label": "transformerxl", + # "models_names": ["transformerxl"], + # "transformerxl_model_name": "transfo-xl-wt103", + # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", + #}, { "lm": "elmo", "label": "elmo", @@ -56,6 +56,21 @@ "bert_model_name": "bert-large-cased", "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", }, + #{ + # "lm": "gpt", + # "label": "gpt", + # "models_names": ["gpt"], + # "gpt_model_name": "openai-gpt", + # "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", + #}, + #{ + # "lm": "roberta", + # "label": "roberta", + # "models_names": ["roberta"], + # "roberta_model_name": "pytorch_model.bin", + # "roberta_model_dir": "pre-trained_language_models/roberta/roberta-base/", + # "roberta_vocab_name": "vocab-2016-09-10.txt", + #}, ] @@ -120,6 +135,11 @@ def run_experiments( print("Exception: {}".format(e)) continue + # https://github.com/facebookresearch/LAMA/issues/30 + if model is not None: + del model + model = None + if model is None: [model_type_name] = args.models_names model = build_model_by_name(model_type_name, args) From 3beeedc4e8665f44098fd0c5abbd75edf3bee704 Mon Sep 17 00:00:00 2001 From: morioka Date: Sat, 9 Jan 2021 15:19:50 +0900 Subject: [PATCH 02/48] Create README_morioka.md --- README_morioka.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 README_morioka.md diff --git a/README_morioka.md b/README_morioka.md new file mode 100644 index 0000000..72b98c6 --- /dev/null +++ b/README_morioka.md @@ -0,0 +1,64 @@ +# LAMA(LAnguage Model Analysis) を再確認してみた + +2021-01-09 + +Yasuhiro MORIOKA + +## 概要 + +LAMA(LAnguage Model Analysis) の環境をそのまま使って、結果の再現を試みた。README.md の手順どおりに進めただけで、独自のデータセットやモデルで確認していない。 + +BERT, BERT-large, Elmoについては Google-RE, T-REx でほぼ同様の結果を得た。Elmo-5Bは未確認。 + +対応されているはずの Transofmer-XL, GPT, RoBERTa は、いずれも実行できなかった。 + +## 内容 + +* 環境 + * ThinkPad E495 (AMD Ryzen5 2.1GHz, RAM 32GB, GPUなし) +* 修正 + * Elmoモデルの状態クリアを追加 + * https://github.com/facebookresearch/LAMA/issues/30 + * GPT, RoBERTa 向け pre-trained_language_models を定義 + * CUDAが利用できない場合の警告メッセージを抑制 + +* 実行 + * "The LAMA probe" の手順をそのまま実行。 + +* 実行時の注意 + + * pyenv-virtualenv 構成で minoconda-3.7 環境を用意し、さらに lama37環境を用意した。 + * pyenv と anaconda のactivateが衝突するので、次の手順で回避。 + * またローカルのlamaモジュールをロードできるようPYTHONPATHを修正。 + + ``` + $ pyenv activate miniconda-3.7.0/envs/lama37 + $ export PYTHONPATH=.'' + $ python scripts/run_experiments.py 2>&1 | tee output.log + ``` + + * 参考 + * pyenvとanacondaを共存させる時のactivate衝突問題の回避策3種類 - Qiita + * https://qiita.com/y__sama/items/f732bb7bec2bff355b69 + * ModuleNotFoundError: No module named 'lama' · Issue #20 · facebookresearch/LAMA + * https://github.com/facebookresearch/LAMA/issues/20 + + +* 結果 + * BERT, BERT-large .. ほぼ論文のとおり。 + * Elmo .. Google-RE, T-REx の評価後、ConceptNetでの評価に移るが途中でエラーも警告も出力せずに終了する。 + * Elmo-5B .. 未実施 + * Transformer-XL .. RuntimeError: $ Torch: invalid memory size -- maybe an overflow? at /pytorch/aten/src/TH/THGeneral.cpp:188 エラー。 + * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 + * RoBERTa .. モデルのロードに失敗。huggingface roberta-baseでなく pytorch/fairseq のモデルを使う必要があるのかもしれない。 + +## 参考 + +* https://github.com/facebookresearch/LAMA +* https://arxiv.org/pdf/1909.01066.pdf +* https://openreview.net/forum?id=025X0zPfn + +* http://lotus.kuee.kyoto-u.ac.jp/~kurita/snlp2019_kurita.pdf +* https://blog.hoxo-m.com/entry/2019/10/24/083000#3-Language-Models-as-Knowledge-Bases +* https://twitter.com/gneubig/status/1177276621172150272 + From 5e3321417c3f13a828c6b11b16ff4a7828fe18b6 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 12 Jan 2021 23:49:16 +0900 Subject: [PATCH 03/48] fix loading RoBERTa --- README_morioka.md | 13 +++++-- download_models.sh | 17 ++++---- lama/vocab_intersection.py | 24 ++++++++---- scripts/run_experiments.py | 80 +++++++++++++++++++------------------- 4 files changed, 74 insertions(+), 60 deletions(-) diff --git a/README_morioka.md b/README_morioka.md index 72b98c6..b355693 100644 --- a/README_morioka.md +++ b/README_morioka.md @@ -1,6 +1,6 @@ # LAMA(LAnguage Model Analysis) を再確認してみた -2021-01-09 +2021-01-12 Yasuhiro MORIOKA @@ -10,16 +10,21 @@ LAMA(LAnguage Model Analysis) の環境をそのまま使って、結果の再 BERT, BERT-large, Elmoについては Google-RE, T-REx でほぼ同様の結果を得た。Elmo-5Bは未確認。 -対応されているはずの Transofmer-XL, GPT, RoBERTa は、いずれも実行できなかった。 +Transformer-XL, GPTはおそらくメモリサイズの問題で実行不可。 +Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの問題で実行不可。 ## 内容 * 環境 * ThinkPad E495 (AMD Ryzen5 2.1GHz, RAM 32GB, GPUなし) + * Windows 10 Home, WSL2, Ubuntu 20.04 * 修正 * Elmoモデルの状態クリアを追加 * https://github.com/facebookresearch/LAMA/issues/30 * GPT, RoBERTa 向け pre-trained_language_models を定義 + * RoBERTa向けモデルダウンロード、vocaburaryのintersection取得の修正 + * huggingface roberta-base でなくfairseq roberta.baseを利用するconnectorコードらしい。 + * https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.md * CUDAが利用できない場合の警告メッセージを抑制 * 実行 @@ -50,7 +55,7 @@ BERT, BERT-large, Elmoについては Google-RE, T-REx でほぼ同様の結果 * Elmo-5B .. 未実施 * Transformer-XL .. RuntimeError: $ Torch: invalid memory size -- maybe an overflow? at /pytorch/aten/src/TH/THGeneral.cpp:188 エラー。 * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 - * RoBERTa .. モデルのロードに失敗。huggingface roberta-baseでなく pytorch/fairseq のモデルを使う必要があるのかもしれない。 + * RoBERTa .. ConceptNetの評価中にメモリ確保エラー。 ## 参考 @@ -58,6 +63,8 @@ BERT, BERT-large, Elmoについては Google-RE, T-REx でほぼ同様の結果 * https://arxiv.org/pdf/1909.01066.pdf * https://openreview.net/forum?id=025X0zPfn +* https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.md + * http://lotus.kuee.kyoto-u.ac.jp/~kurita/snlp2019_kurita.pdf * https://blog.hoxo-m.com/entry/2019/10/24/083000#3-Language-Models-as-Knowledge-Bases * https://twitter.com/gneubig/status/1177276621172150272 diff --git a/download_models.sh b/download_models.sh index 6a726d8..a822f4e 100755 --- a/download_models.sh +++ b/download_models.sh @@ -132,15 +132,14 @@ if [[ ! -f bert/cased_L-24_H-1024_A-16/bert_config.json ]]; then fi echo "RoBERTa" -if [[ ! -f roberta/roberta-base/config.json ]]; then - rm -rf 'roberta/roberta-base' - mkdir -p 'roberta/roberta-base' - cd 'roberta/roberta-base' - wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json' -O vocab.json - wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt' -O merges.txt - wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin' -O 'pytorch_model.bin' - wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json' -O 'config.json' - cd ../.. +if [[ ! -f roberta/roberta.base/dict.txt ]]; then + rm -rf 'roberta/roberta.base' + mkdir -p 'roberta/roberta.base' + cd 'roberta' + wget -c 'https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz' + tar -xzf roberta.base.tar.gz + rm roberta.base.tar.gz + cd .. fi diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 235024c..47b19c2 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -21,11 +21,11 @@ # "cpu": True, # "output_dictionary_size": -1 # }, - { - # "TransformerXL" - "lm": "transformerxl", - "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", - }, + # { + # # "TransformerXL" + # "lm": "transformerxl", + # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", + # }, { # "ELMO ORIGINAL" "lm": "elmo", @@ -37,7 +37,7 @@ { # "ELMO ORIGINAL 5.5B" "lm": "elmo", - "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", + "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B", "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", "elmo_vocab_name": "vocab-enwiki-news-500000.txt", "elmo_warm_up_cycles": 5 @@ -46,15 +46,23 @@ # "BERT BASE CASED" "lm": "bert", "bert_model_name": "bert-base-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12/", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", "bert_vocab_name": "vocab.txt" }, { # "BERT LARGE CASED" "lm" : "bert", "bert_model_name": "bert-large-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16/", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", "bert_vocab_name": "vocab.txt" + }, + { + # "RoBERTa base" + "lm" : "roberta", + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + "max_sentence_length": 100 } ] diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 3ace2cb..cbce3c9 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -24,38 +24,38 @@ # "transformerxl_model_name": "transfo-xl-wt103", # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", #}, - { - "lm": "elmo", - "label": "elmo", - "models_names": ["elmo"], - "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", - "elmo_vocab_name": "vocab-2016-09-10.txt", - "elmo_model_dir": "pre-trained_language_models/elmo/original", - "elmo_warm_up_cycles": 10, - }, - { - "lm": "elmo", - "label": "elmo5B", - "models_names": ["elmo"], - "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", - "elmo_vocab_name": "vocab-enwiki-news-500000.txt", - "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", - "elmo_warm_up_cycles": 10, - }, - { - "lm": "bert", - "label": "bert_base", - "models_names": ["bert"], - "bert_model_name": "bert-base-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", - }, - { - "lm": "bert", - "label": "bert_large", - "models_names": ["bert"], - "bert_model_name": "bert-large-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", - }, + #{ + # "lm": "elmo", + # "label": "elmo", + # "models_names": ["elmo"], + # "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", + # "elmo_vocab_name": "vocab-2016-09-10.txt", + # "elmo_model_dir": "pre-trained_language_models/elmo/original", + # "elmo_warm_up_cycles": 10, + #}, + #{ + # "lm": "elmo", + # "label": "elmo5B", + # "models_names": ["elmo"], + # "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", + # "elmo_vocab_name": "vocab-enwiki-news-500000.txt", + # "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", + # "elmo_warm_up_cycles": 10, + #}, + #{ + # "lm": "bert", + # "label": "bert_base", + # "models_names": ["bert"], + # "bert_model_name": "bert-base-cased", + # "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + #}, + #{ + # "lm": "bert", + # "label": "bert_large", + # "models_names": ["bert"], + # "bert_model_name": "bert-large-cased", + # "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", + #}, #{ # "lm": "gpt", # "label": "gpt", @@ -63,14 +63,14 @@ # "gpt_model_name": "openai-gpt", # "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", #}, - #{ - # "lm": "roberta", - # "label": "roberta", - # "models_names": ["roberta"], - # "roberta_model_name": "pytorch_model.bin", - # "roberta_model_dir": "pre-trained_language_models/roberta/roberta-base/", - # "roberta_vocab_name": "vocab-2016-09-10.txt", - #}, + { + "lm": "roberta", + "label": "roberta.base", + "models_names": ["roberta"], + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + }, ] From 7669dd2964cd29e6b9cf479c4035629dcc5b641d Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 13 Jan 2021 00:27:30 +0900 Subject: [PATCH 04/48] add TODO --- README_morioka.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README_morioka.md b/README_morioka.md index b355693..94c5ff1 100644 --- a/README_morioka.md +++ b/README_morioka.md @@ -57,6 +57,10 @@ Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 * RoBERTa .. ConceptNetの評価中にメモリ確保エラー。 +* TODO + * P27に対応する文テンプレートが T-REx では不適切 https://github.com/facebookresearch/LAMA/issues/40 + * fairseq RoBERTaでなくhuggingface RoBERTaをロードしたい https://github.com/facebookresearch/LAMA/issues/15 + ## 参考 * https://github.com/facebookresearch/LAMA From cdaefb8b11529bd866abe386fec3b23c6f4ed442 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 13 Jan 2021 08:49:04 +0900 Subject: [PATCH 05/48] add memo regarding RoBERTa --- README_morioka.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README_morioka.md b/README_morioka.md index 94c5ff1..d8083cb 100644 --- a/README_morioka.md +++ b/README_morioka.md @@ -55,7 +55,8 @@ Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの * Elmo-5B .. 未実施 * Transformer-XL .. RuntimeError: $ Torch: invalid memory size -- maybe an overflow? at /pytorch/aten/src/TH/THGeneral.cpp:188 エラー。 * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 - * RoBERTa .. ConceptNetの評価中にメモリ確保エラー。 + * RoBERTa .. BERTよりも少し悪い。ConceptNetの評価中にメモリ確保エラー。 + * https://github.com/facebookresearch/LAMA/issues/16 * TODO * P27に対応する文テンプレートが T-REx では不適切 https://github.com/facebookresearch/LAMA/issues/40 From 8bfa3796b96abae6bc194b116d2f96ef21fb0cb5 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 18 Jan 2021 05:15:16 +0900 Subject: [PATCH 06/48] fix options typo --- lama/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lama/options.py b/lama/options.py index a767ef7..1f4a552 100644 --- a/lama/options.py +++ b/lama/options.py @@ -41,7 +41,7 @@ def get_general_parser(): dest="max_sentence_length", type=int, default=100, - help="max sentence lenght", + help="max sentence length", ) __add_bert_args(parser) __add_elmo_args(parser) From 1f72512e3a809c4772160bb1cc86a213fe6daf2c Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 01:51:30 +0900 Subject: [PATCH 07/48] add HF RoBERTa connector (work in progress) --- download_models.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/download_models.sh b/download_models.sh index a822f4e..3ee8074 100755 --- a/download_models.sh +++ b/download_models.sh @@ -142,6 +142,18 @@ if [[ ! -f roberta/roberta.base/dict.txt ]]; then cd .. fi +echo "HuggingFace RoBERTa" +if [[ ! -f roberta/roberta-base/config.json ]]; then + rm -rf 'roberta/roberta-base' + mkdir -p 'roberta/roberta-base' + cd 'roberta/roberta-base' + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json' -O vocab.json + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt' -O merges.txt + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin' -O 'pytorch_model.bin' + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json' -O 'config.json' + cd ../.. +fi + cd "$ROOD_DIR" echo 'Building common vocab' From 65f15f0d918dcac47a592c421c8f1047ac946f0b Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 01:54:49 +0900 Subject: [PATCH 08/48] add HF RoBERTa connector (work in progress) --- lama/modules/base_connector.py | 12 +- lama/modules/hfroberta_connector.py | 248 ++++++++++++++++++++++++++++ lama/vocab_intersection.py | 8 + scripts/batch_eval_KB_completion.py | 7 +- scripts/run_experiments.py | 19 ++- 5 files changed, 285 insertions(+), 9 deletions(-) create mode 100644 lama/modules/hfroberta_connector.py diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index 49041bf..ea0fef1 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -17,10 +17,13 @@ ELMO_END_SENTENCE = "" OPENAI_UNK = "" OPENAI_EOS = "" -ROBERTA_MASK = "" +ROBERTA_MASK = "" ROBERTA_START_SENTENCE = "" ROBERTA_END_SENTENCE = "" ROBERTA_VOCAB_SIZE = 50266 +ROBERTA_UNK = "" +ROBERTA_PAD = "" + SPECIAL_SYMBOLS = [ MASK, @@ -32,7 +35,12 @@ ELMO_START_SENTENCE, ELMO_END_SENTENCE, OPENAI_UNK, - OPENAI_EOS + OPENAI_EOS, + ROBERTA_MASK, + # ROBERTA_UNK, + ROBERTA_PAD, + ROBERTA_START_SENTENCE, + ROBERTA_END_SENTENCE, ] SPACE_NORMALIZER = re.compile(r"\s+") diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py new file mode 100644 index 0000000..7d8b107 --- /dev/null +++ b/lama/modules/hfroberta_connector.py @@ -0,0 +1,248 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +from pytorch_transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM + +import torch +import numpy as np +from lama.modules.base_connector import * + +import torch.nn.functional as F + +class HfRoberta(Base_Connector): + + def __init__(self, args): + super().__init__() + + if args.hfroberta_model_dir is not None: + # load bert model from file + roberta_model_name = str(args.hfroberta_model_dir) + "/" + dict_file = roberta_model_name + print("loading huggingface RoBERTa model from {}".format(roberta_model_name)) + else: + # load RoBERTa model from huggingface cache + roberta_model_name = args.hfroberta_model_name + dict_file = roberta_model_name + + # When using a cased model, make sure to pass do_lower_case=False directly to BaseTokenizer + do_lower_case = False + if 'uncased' in roberta_model_name: + do_lower_case=True + + # Load pre-trained model tokenizer (vocabulary) + self.tokenizer = RobertaTokenizer.from_pretrained(dict_file) + + # original vocab + self.map_indices = None + + # GPT uses different way to represent BPE then BERT. Namely, the + # final suffixes are indicated with suffix, while pieces that must + # be followed are written as is. In BERT the prefixes are written as is + # while the parts that must follow (not be followed!) have '##' prefix. + # There is no one-to-one coversion. But at least we may make pieces that + # may form a full word look the same. + # Note that we should be very careful now, + # tokenizer.convert_tokens_to_ids won't work with our vocabulary. + def convert_word(word): + if word == ROBERTA_UNK: # word == OPENAI_UNK: + return word + if word == '\n': + # Redefine symbol EOS to improve visualization. + return ROBERTA_EOS # OPENAI_EOS + # return word[:-4] if word.endswith('') else f'{word}##' + return word[:-4] if word.endswith('') else f'{word}' + + _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) + self.vocab = [convert_word(word) for word in gpt_vocab] + self._init_inverse_vocab() + + # Get UNK symbol as it's written in the origin RoBERTa vocab. + unk_index = self.inverse_vocab[ROBERTA_UNK] # OPENAI_UNK + self.unk_symbol = self.tokenizer.decoder[unk_index] + + # Get MASK symbol as it's written in the origin RoBERTa vocab. + mask_index = self.inverse_vocab[ROBERTA_MASK] + self.mask_symbol = self.tokenizer.decoder[mask_index] + + # Load pre-trained model (weights) + self.masked_roberta_model = RobertaForMaskedLM.from_pretrained(roberta_model_name) + self.masked_roberta_model.eval() + print(self.masked_roberta_model.config) + + # ... to get hidden states + self.roberta_model = self.masked_roberta_model.roberta + + # Sanity check. + #assert len(self.vocab) == self.masked_roberta_model.config.vocab_size + #assert 0 == self.masked_roberta_model.config.n_special + + self.eos_id = self.inverse_vocab[ROBERTA_END_SENTENCE] # OPENAI_EOS + self.model_vocab = self.vocab + + self.pad_id = self.inverse_vocab[ROBERTA_PAD] + self.unk_index = self.inverse_vocab[ROBERTA_UNK] + self.mask_index = mask_index + + def __get_token_ids_from_tensor(self, indexed_string): + token_ids = [] + if self.map_indices is not None: + # map indices to subset of the vocabulary + indexed_string = self.convert_ids(indexed_string) + token_ids = np.asarray(indexed_string) + else: + token_ids = indexed_string + return token_ids + + def _cuda(self): + self.masked_roberta_model.cuda() + + def get_id(self, string): + tokenized_text = self.tokenizer.tokenize(string) + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + # indexed_string = self.convert_ids(indexed_string) + return indexed_string + + def __get_input_tensors_batch(self, sentences_list): + tokens_tensors_list = [] + segments_tensors_list = [] + masked_indices_list = [] + tokenized_text_list = [] + max_tokens = 0 + for sentences in sentences_list: + tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(sentences) + tokens_tensors_list.append(tokens_tensor) + segments_tensors_list.append(segments_tensor) + masked_indices_list.append(masked_indices) + tokenized_text_list.append(tokenized_text) + # assert(tokens_tensor.shape[1] == segments_tensor.shape[1]) + if (tokens_tensor.shape[1] > max_tokens): + max_tokens = tokens_tensor.shape[1] + # print("MAX_TOKENS: {}".format(max_tokens)) + # apply padding and concatenate tensors + # use [PAD] for tokens and 0 for segments + final_tokens_tensor = None + final_segments_tensor = None + final_attention_mask = None + for tokens_tensor, segments_tensor in zip(tokens_tensors_list, segments_tensors_list): + dim_tensor = tokens_tensor.shape[1] + pad_lenght = max_tokens - dim_tensor + attention_tensor = torch.full([1,dim_tensor], 1, dtype= torch.long) + if pad_lenght>0: + pad_1 = torch.full([1,pad_lenght], self.pad_id, dtype= torch.long) + pad_2 = torch.full([1,pad_lenght], 0, dtype= torch.long) + attention_pad = torch.full([1,pad_lenght], 0, dtype= torch.long) + tokens_tensor = torch.cat((tokens_tensor,pad_1), dim=1) + segments_tensor = torch.cat((segments_tensor,pad_2), dim=1) + attention_tensor = torch.cat((attention_tensor,attention_pad), dim=1) + if final_tokens_tensor is None: + final_tokens_tensor = tokens_tensor + final_segments_tensor = segments_tensor + final_attention_mask = attention_tensor + else: + final_tokens_tensor = torch.cat((final_tokens_tensor,tokens_tensor), dim=0) + final_segments_tensor = torch.cat((final_segments_tensor,segments_tensor), dim=0) + final_attention_mask = torch.cat((final_attention_mask,attention_tensor), dim=0) + # print(final_tokens_tensor) + # print(final_segments_tensor) + # print(final_attention_mask) + # print(final_tokens_tensor.shape) + # print(final_segments_tensor.shape) + # print(final_attention_mask.shape) + return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list + + def __get_input_tensors(self, sentences): + tokenized_text = [] + masked_indices = [] + segment_indices = [] + for sentence_idx, sentence in enumerate(sentences): + if sentence_idx > 0: + tokenized_text.append(ROBERTA_END_SENTENCE) # OPENAI_EOS) + for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): + if chunk_idx > 0: + masked_indices.append(len(tokenized_text)) + segment_indices.append(sentence_idx) + tokenized_text.append(self.mask_symbol) + chunk = chunk.strip() + if chunk: + tokenized_sentence = self.tokenizer.tokenize(chunk) + segment_id = np.full(len(tokenized_sentence), + sentence_idx, + dtype=int).tolist() + + tokenized_text.extend(tokenized_sentence) + segment_indices.extend(segment_id) + + # add [CLS] token at the beginning + tokenized_text.insert(0,ROBERTA_START_SENTENCE) + segment_indices.insert(0,0) + + # look for masked indices + masked_indices = [] + for i in range(len(tokenized_text)): + token = tokenized_text[i] + if token == ROBERTA_MASK: # MASK + masked_indices.append(i) + + indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) + + # Convert inputs to PyTorch tensors + tokens_tensor = torch.tensor([indexed_tokens]) + segments_tensors = torch.tensor([segment_indices]) + + return tokens_tensor, segments_tensors, masked_indices, tokenized_text + + + def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): + if not sentences_list: + return None + if try_cuda: + self.try_cuda() + + tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) + + if logger is not None: + logger.debug("\n{}\n".format(tokenized_text_list)) + + with torch.no_grad(): + logits = self.masked_roberta_model( + input_ids=tokens_tensor.to(self._model_device), + token_type_ids=segments_tensor.to(self._model_device), + attention_mask=attention_mask_tensor.to(self._model_device), + ) + if isinstance(logits, tuple): # ケースによって、tupleだったり、そうでなかったり.. + logits = logits[0] + + log_probs = F.log_softmax(logits, dim=-1).cpu() + + token_ids_list = [] + for indexed_string in tokens_tensor.numpy(): + token_ids_list.append(self.__get_token_ids_from_tensor(indexed_string)) + + return log_probs, token_ids_list, masked_indices_list + + def get_contextual_embeddings(self, sentences_list, try_cuda=True): + + # assume in input 1 or 2 sentences - in general, it considers only the first 2 sentences + if not sentences_list: + return None + if try_cuda: + self.try_cuda() + + tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) + + with torch.no_grad(): + all_encoder_layers, _ = self.roberta_model( + tokens_tensor.to(self._model_device), + segments_tensor.to(self._model_device)) + + all_encoder_layers = [layer.cpu() for layer in all_encoder_layers] + + sentence_lengths = [len(x) for x in tokenized_text_list] + + # all_encoder_layers: a list of the full sequences of encoded-hidden-states at the end + # of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each + # encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size] + return all_encoder_layers, sentence_lengths, tokenized_text_list diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 47b19c2..0c02555 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -63,6 +63,14 @@ "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", "roberta_vocab_name": "dict.txt", "max_sentence_length": 100 + }, + { + # "hfRoBERTa base" + "lm" : "hfroberta", + "hfroberta_model_name": "roberta-base", + "hfroberta_model_dir": None, +# "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", +# "hfroberta_vocab_name": "vocab.txt", } ] diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 1b8fec4..4d71579 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -157,7 +157,8 @@ def run_thread(arguments): label_index=arguments["label_index"], index_list=arguments["index_list"], print_generation=arguments["interactive"], - topk=10000, +# topk=10000, + topk=1000, ) msg += "\n" + return_msg @@ -318,6 +319,10 @@ def main(args, shuffle_data=True, model=None): model_name = "BERT_{}".format(args.bert_model_name) elif model_type_name == "elmo": model_name = "ELMo_{}".format(args.elmo_model_name) + elif model_type_name == "roberta": + model_name = "RoBERTa_{}".format(args.roberta_model_name) + elif model_type_name == "hfroberta": + model_name = "hfRoBERTa_{}".format(args.hfroberta_model_name) else: model_name = model_type_name.title() diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index cbce3c9..2f40740 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -63,13 +63,20 @@ # "gpt_model_name": "openai-gpt", # "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", #}, + #{ + # "lm": "roberta", + # "label": "roberta.base", + # "models_names": ["roberta"], + # "roberta_model_name": "model.pt", + # "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + # "roberta_vocab_name": "dict.txt", + #}, { - "lm": "roberta", - "label": "roberta.base", - "models_names": ["roberta"], - "roberta_model_name": "model.pt", - "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", - "roberta_vocab_name": "dict.txt", + "lm": "hfroberta", + "label": "rober-ta-base", + "models_names": ["hfroberta"], + "hfroberta_model_name": "roberta-base", + "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", }, ] From ab486727f1d3f6f3ce75b0c5d204a6f4134bd47f Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 08:08:53 +0900 Subject: [PATCH 09/48] fix --- lama/modules/__init__.py | 4 +++- scripts/run_experiments.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lama/modules/__init__.py b/lama/modules/__init__.py index 73c9fcf..f02e310 100644 --- a/lama/modules/__init__.py +++ b/lama/modules/__init__.py @@ -9,6 +9,7 @@ from .gpt_connector import GPT from .transformerxl_connector import TransformerXL from .roberta_connector import Roberta +from .hfroberta_connector import HfRoberta def build_model_by_name(lm, args, verbose=True): @@ -22,7 +23,8 @@ def build_model_by_name(lm, args, verbose=True): bert=Bert, gpt=GPT, transformerxl=TransformerXL, - roberta=Roberta + roberta=Roberta, + hfroberta=HfRoberta ) if lm not in MODEL_NAME_TO_CLASS: raise ValueError("Unrecognized Language Model: %s." % lm) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 2f40740..3663f61 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -73,7 +73,7 @@ #}, { "lm": "hfroberta", - "label": "rober-ta-base", + "label": "roberta-base", "models_names": ["hfroberta"], "hfroberta_model_name": "roberta-base", "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", From 29d4d87bd3c2d134896a13d5bea51469ce3ee672 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 08:09:29 +0900 Subject: [PATCH 10/48] avoid div0 --- scripts/batch_eval_KB_completion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 4d71579..bd5db3c 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -651,6 +651,9 @@ def main(args, shuffle_data=True, model=None): pool.join() # stats + if len(list_of_results) == 0: + list_of_results = 1e18 # avoid div0 + # Mean reciprocal rank MRR /= len(list_of_results) From a05717e080dd0f88a66acfd2130234ecf0399c47 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 08:43:42 +0900 Subject: [PATCH 11/48] fix --- lama/modules/hfroberta_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 7d8b107..b7dae03 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -70,7 +70,7 @@ def convert_word(word): # Load pre-trained model (weights) self.masked_roberta_model = RobertaForMaskedLM.from_pretrained(roberta_model_name) self.masked_roberta_model.eval() - print(self.masked_roberta_model.config) + #print(self.masked_roberta_model.config) # ... to get hidden states self.roberta_model = self.masked_roberta_model.roberta From 79ffbfd201faa09c7fd49a2bf16f8ec7d7a5f7fe Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 10:18:28 +0900 Subject: [PATCH 12/48] fix --- scripts/batch_eval_KB_completion.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index bd5db3c..eb0fe40 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -158,7 +158,7 @@ def run_thread(arguments): index_list=arguments["index_list"], print_generation=arguments["interactive"], # topk=10000, - topk=1000, + topk=5000, ) msg += "\n" + return_msg @@ -651,15 +651,15 @@ def main(args, shuffle_data=True, model=None): pool.join() # stats - if len(list_of_results) == 0: - list_of_results = 1e18 # avoid div0 - - # Mean reciprocal rank - MRR /= len(list_of_results) - - # Precision - Precision /= len(list_of_results) - Precision1 /= len(list_of_results) + try: + # Mean reciprocal rank + MRR /= len(list_of_results) + + # Precision + Precision /= len(list_of_results) + Precision1 /= len(list_of_results) + except ZeriDivisionError: + MRR = Precision = Precision1 = 0.0 msg = "all_samples: {}\n".format(len(all_samples)) msg += "list_of_results: {}\n".format(len(list_of_results)) From aecca5d0f89dc78455f72bd9409af30c76c66b69 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 10:19:21 +0900 Subject: [PATCH 13/48] turn-on bert-base case --- scripts/run_experiments.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 3663f61..8bae3ff 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -42,13 +42,13 @@ # "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", # "elmo_warm_up_cycles": 10, #}, - #{ - # "lm": "bert", - # "label": "bert_base", - # "models_names": ["bert"], - # "bert_model_name": "bert-base-cased", - # "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", - #}, + { + "lm": "bert", + "label": "bert_base", + "models_names": ["bert"], + "bert_model_name": "bert-base-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + }, #{ # "lm": "bert", # "label": "bert_large", From 5df078bb2ca0b083a7ff3d61a596f00cb12a935c Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 27 Jan 2021 10:30:31 +0900 Subject: [PATCH 14/48] fix --- scripts/batch_eval_KB_completion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index eb0fe40..82bdeba 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -658,7 +658,7 @@ def main(args, shuffle_data=True, model=None): # Precision Precision /= len(list_of_results) Precision1 /= len(list_of_results) - except ZeriDivisionError: + except ZeroDivisionError: MRR = Precision = Precision1 = 0.0 msg = "all_samples: {}\n".format(len(all_samples)) From 53a480633d6fb64ac590e2582ac3074a8fcaf0cb Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 28 Jan 2021 00:33:09 +0900 Subject: [PATCH 15/48] clean --- lama/vocab_intersection.py | 2 -- scripts/batch_eval_KB_completion.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 0c02555..7292a8b 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -69,8 +69,6 @@ "lm" : "hfroberta", "hfroberta_model_name": "roberta-base", "hfroberta_model_dir": None, -# "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", -# "hfroberta_vocab_name": "vocab.txt", } ] diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 82bdeba..1b83d49 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -157,8 +157,7 @@ def run_thread(arguments): label_index=arguments["label_index"], index_list=arguments["index_list"], print_generation=arguments["interactive"], -# topk=10000, - topk=5000, + topk=10000, ) msg += "\n" + return_msg From 3ca10c1e6cefa9f9f0c31680b8f4f25cf9276841 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 28 Jan 2021 00:34:53 +0900 Subject: [PATCH 16/48] fix byte-level BPE issue (an encoded SPC char) --- lama/modules/hfroberta_connector.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index b7dae03..3a32177 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -53,6 +53,8 @@ def convert_word(word): # Redefine symbol EOS to improve visualization. return ROBERTA_EOS # OPENAI_EOS # return word[:-4] if word.endswith('') else f'{word}##' + if word.startswith('Ġ'): + return word[1:] return word[:-4] if word.endswith('') else f'{word}' _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) From 63a2eb727af884456058229df755476caaf596a1 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 28 Jan 2021 01:25:36 +0900 Subject: [PATCH 17/48] for experiment --- scripts/run_experiments.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 8bae3ff..99973bb 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -63,14 +63,14 @@ # "gpt_model_name": "openai-gpt", # "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", #}, - #{ - # "lm": "roberta", - # "label": "roberta.base", - # "models_names": ["roberta"], - # "roberta_model_name": "model.pt", - # "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", - # "roberta_vocab_name": "dict.txt", - #}, + { + "lm": "roberta", + "label": "roberta.base", + "models_names": ["roberta"], + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + }, { "lm": "hfroberta", "label": "roberta-base", From 06b406692eb52000a4d106a6067557582c045323 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 28 Jan 2021 01:25:50 +0900 Subject: [PATCH 18/48] for experiment --- scripts/batch_eval_KB_completion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 1b83d49..82bdeba 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -157,7 +157,8 @@ def run_thread(arguments): label_index=arguments["label_index"], index_list=arguments["index_list"], print_generation=arguments["interactive"], - topk=10000, +# topk=10000, + topk=5000, ) msg += "\n" + return_msg From 06ba24bec53e834ab6fc32547792368e91b36435 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 28 Jan 2021 01:27:23 +0900 Subject: [PATCH 19/48] clean --- lama/modules/hfroberta_connector.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 3a32177..13f551e 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -46,6 +46,13 @@ def __init__(self, args): # may form a full word look the same. # Note that we should be very careful now, # tokenizer.convert_tokens_to_ids won't work with our vocabulary. + + # RoBERTa also uses BPE. the bytes_to_unicode function takes all control + # and whitespace characters in code points 0-255 and shifts them up + # by 256 to make them printable. So space (code point 32) becomes Ġ (code point 288). + # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159). + # + # Other control characters will be removed during voca_intersection process. def convert_word(word): if word == ROBERTA_UNK: # word == OPENAI_UNK: return word @@ -53,9 +60,7 @@ def convert_word(word): # Redefine symbol EOS to improve visualization. return ROBERTA_EOS # OPENAI_EOS # return word[:-4] if word.endswith('') else f'{word}##' - if word.startswith('Ġ'): - return word[1:] - return word[:-4] if word.endswith('') else f'{word}' + return word[1:] if word.startswith('Ġ') else word _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) self.vocab = [convert_word(word) for word in gpt_vocab] @@ -216,6 +221,9 @@ def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): ) if isinstance(logits, tuple): # ケースによって、tupleだったり、そうでなかったり.. logits = logits[0] + print('########################') + print(f'hfroberta logits type is {str(type(logits))}') + print('########################') log_probs = F.log_softmax(logits, dim=-1).cpu() From b1d7fc8cda72eca2864833a9bc31e5161d6728a6 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 28 Jan 2021 01:27:33 +0900 Subject: [PATCH 20/48] add memo --- README_morioka.md | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/README_morioka.md b/README_morioka.md index d8083cb..49c40fb 100644 --- a/README_morioka.md +++ b/README_morioka.md @@ -22,11 +22,14 @@ Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの * Elmoモデルの状態クリアを追加 * https://github.com/facebookresearch/LAMA/issues/30 * GPT, RoBERTa 向け pre-trained_language_models を定義 - * RoBERTa向けモデルダウンロード、vocaburaryのintersection取得の修正 - * huggingface roberta-base でなくfairseq roberta.baseを利用するconnectorコードらしい。 - * https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.md + * FairSeq/HuggingFace Roberta向けSpecial tokenを定義 + * Fairseq RoBERTa向けモデルダウンロード、vocaburaryのintersection取得を修正 + * HuggingFace RoBERTa向けモデルダウンロード, HfRobertaConnectorを作成 + * https://github.com/facebookresearch/LAMA/issues/15 + * bert_connectorを基本に gpt_connector のtokernizerの扱いを適用 + * MRR, precision計算での len(list_of_results) == 0 の場合を処理 * CUDAが利用できない場合の警告メッセージを抑制 - + * 実行 * "The LAMA probe" の手順をそのまま実行。 @@ -55,12 +58,41 @@ Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの * Elmo-5B .. 未実施 * Transformer-XL .. RuntimeError: $ Torch: invalid memory size -- maybe an overflow? at /pytorch/aten/src/TH/THGeneral.cpp:188 エラー。 * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 - * RoBERTa .. BERTよりも少し悪い。ConceptNetの評価中にメモリ確保エラー。 + * Fairseq RoBERTa .. BERTよりも少し悪い。ConceptNetの評価中にメモリ確保エラー。 * https://github.com/facebookresearch/LAMA/issues/16 + * HuggingFace RoBERTa .. * TODO * P27に対応する文テンプレートが T-REx では不適切 https://github.com/facebookresearch/LAMA/issues/40 - * fairseq RoBERTaでなくhuggingface RoBERTaをロードしたい https://github.com/facebookresearch/LAMA/issues/15 + +* 疑問 + * len(common_vocab) が小さければ、メモリに関するエラーは生じない? + * fairseq roberta.base に比べて huggingface roberta-base の成績が明確に悪い + * 語彙は同じ? --> おそらく違う + * elmo, elmo5B, bert-base, bert-lage ... 21107 + * + fairseq roberta.base ... 18129 + * + huggingface roberta-base ... 5254 + * 語彙を出力してみる。 + * 'Ġ' の扱い。SPCは'Ġ'にされる。ほか制御文字の扱い。可読文字ではないし共通語委作成時に削除されるはず + * https://github.com/huggingface/transformers/issues/3867#issuecomment-616956437 + * https://github.com/openai/gpt-2/issues/80#issuecomment-487202159 + * word 先頭の 'Ġ' だけ削除して + hf roberta-base ... 18117 .. ほぼ一致とみてよいか。 + * 予測も同じ? was_born_in だけででも。 + * precision@10でも悪いので、本当に悪いのかも。 + * vocab_intersection での除外条件が雑な気がする。これしかないのかもしれないが。 + * nlp = spacy('en') に基づく。 + * stop_word は上記 nlp が返すもの + * punctuation, synonym(symbol?)は各wordを nlp(word)の入力として tokenize結果のPOSで判断。よって数字だけでもPUNCTと判断されるケースがたびたび生じている。文脈が不足しているのだろう。 + +* メモ + * 語彙の大きさは同等だがこれまで以上にひどい。 + * intersectionをとるまでは、やたら制御文字だけ文がある。 + * 実際にデコードさせタ結果を確認する必要がある。 + * "" の前にSPACEをつけて " "とする必要がある? + * 文頭にもSPACEをつける? + * SPACEがついていないということは分割されたが先頭ではないトークンということで、そこに区別はあるのか? + * 一致をどうやって見ているのか。デコード時に適宜 SPACEを付与してやる必要があるのか。 + ## 参考 From 1ea76aab60cf790ce6050d5896562a99a2563859 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Fri, 29 Jan 2021 07:55:45 +0900 Subject: [PATCH 21/48] fix hfroberta_connector --- lama/modules/hfroberta_connector.py | 56 +++++++++++++++++++---------- scripts/batch_eval_KB_completion.py | 3 ++ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 13f551e..90b7b07 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -36,7 +36,6 @@ def __init__(self, args): self.tokenizer = RobertaTokenizer.from_pretrained(dict_file) # original vocab - self.map_indices = None # GPT uses different way to represent BPE then BERT. Namely, the # final suffixes are indicated with suffix, while pieces that must @@ -53,14 +52,37 @@ def __init__(self, args): # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159). # # Other control characters will be removed during voca_intersection process. - def convert_word(word): + def convert_word0(word): if word == ROBERTA_UNK: # word == OPENAI_UNK: return word if word == '\n': # Redefine symbol EOS to improve visualization. return ROBERTA_EOS # OPENAI_EOS # return word[:-4] if word.endswith('') else f'{word}##' - return word[1:] if word.startswith('Ġ') else word + return word[:-4] if word.endswith('') else f'{word}' + + def convert_word(word): + # return convert_word0(word) + + if word == ROBERTA_UNK: + return word + if word == ROBERTA_MASK: + return word + if word == ROBERTA_START_SENTENCE: + return word + if word == ROBERTA_END_SENTENCE: + return word + if word == ROBERTA_PAD: + return word + + if word.startswith('Ġ'): # the token starts with a whitespace + return word[1:] + + return f'_{word}_' # the token not start with a white space. + # may be not a head of a word, + # or may be a head of a sentence. + + # need duplitation check? _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) self.vocab = [convert_word(word) for word in gpt_vocab] @@ -94,22 +116,22 @@ def convert_word(word): self.mask_index = mask_index def __get_token_ids_from_tensor(self, indexed_string): - token_ids = [] - if self.map_indices is not None: - # map indices to subset of the vocabulary - indexed_string = self.convert_ids(indexed_string) - token_ids = np.asarray(indexed_string) - else: - token_ids = indexed_string + token_ids = indexed_string return token_ids def _cuda(self): self.masked_roberta_model.cuda() def get_id(self, string): + try: + return [ self.inverse_vocab[string] ] + except: + tokenized_text = self.tokenizer.tokenize(f' {string}') + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + return indexed_string + tokenized_text = self.tokenizer.tokenize(string) indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) - # indexed_string = self.convert_ids(indexed_string) return indexed_string def __get_input_tensors_batch(self, sentences_list): @@ -167,12 +189,14 @@ def __get_input_tensors(self, sentences): for sentence_idx, sentence in enumerate(sentences): if sentence_idx > 0: tokenized_text.append(ROBERTA_END_SENTENCE) # OPENAI_EOS) + + sentence = ' ' + sentence for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): if chunk_idx > 0: masked_indices.append(len(tokenized_text)) segment_indices.append(sentence_idx) tokenized_text.append(self.mask_symbol) - chunk = chunk.strip() + #chunk = chunk.strip() if chunk: tokenized_sentence = self.tokenizer.tokenize(chunk) segment_id = np.full(len(tokenized_sentence), @@ -207,7 +231,7 @@ def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): return None if try_cuda: self.try_cuda() - + #print(sentences_list) tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(sentences_list) if logger is not None: @@ -219,11 +243,7 @@ def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): token_type_ids=segments_tensor.to(self._model_device), attention_mask=attention_mask_tensor.to(self._model_device), ) - if isinstance(logits, tuple): # ケースによって、tupleだったり、そうでなかったり.. - logits = logits[0] - print('########################') - print(f'hfroberta logits type is {str(type(logits))}') - print('########################') + logits = logits[0] log_probs = F.log_softmax(logits, dim=-1).cpu() diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 82bdeba..b5d07af 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -230,6 +230,7 @@ def filter_samples(model, samples, vocab_subset, max_sentence_length, template): if "obj_label" in sample and "sub_label" in sample: obj_label_ids = model.get_id(sample["obj_label"]) +# print(f'obj_label: {sample["obj_label"]} -> {obj_label_ids}') if obj_label_ids: recostructed_word = " ".join( @@ -238,6 +239,8 @@ def filter_samples(model, samples, vocab_subset, max_sentence_length, template): else: recostructed_word = None +# print(f'reconstructed_word: {recostructed_word}') + excluded = False if not template or len(template) == 0: masked_sentences = sample["masked_sentences"] From 59dbc4786b14a7962aa45a531e7c0bd70a48bcd0 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Fri, 29 Jan 2021 08:46:45 +0900 Subject: [PATCH 22/48] fix last_results.csv --- scripts/run_experiments.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 99973bb..364fdcd 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -101,7 +101,11 @@ def run_experiments( type_Precision1 = defaultdict(list) type_count = defaultdict(list) - results_file = open("last_results.csv", "w+") + results_file = open("last_results.csv", "a+") + results_file.write( + "=={}==\n".format(input_param["label"]) + ) + results_file.flush() for relation in relations: pp.pprint(relation) From 01aa1d81a24174f853b56e7a9a5d82efc97c2e56 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Fri, 29 Jan 2021 23:26:06 +0900 Subject: [PATCH 23/48] clean --- README_morioka.md | 47 +++++++++-------------------- download_models.sh | 2 +- lama/modules/base_connector.py | 2 +- lama/modules/hfroberta_connector.py | 2 +- 4 files changed, 18 insertions(+), 35 deletions(-) diff --git a/README_morioka.md b/README_morioka.md index 49c40fb..973cd18 100644 --- a/README_morioka.md +++ b/README_morioka.md @@ -1,17 +1,20 @@ # LAMA(LAnguage Model Analysis) を再確認してみた 2021-01-12 +2021-01-29 更新 Yasuhiro MORIOKA ## 概要 -LAMA(LAnguage Model Analysis) の環境をそのまま使って、結果の再現を試みた。README.md の手順どおりに進めただけで、独自のデータセットやモデルで確認していない。 +LAMA(LAnguage Model Analysis) の環境をそのまま使って、結果の再現を試みた。 +README.md の手順どおりに進めただけで、独自のデータセットやモデルで確認していない。 -BERT, BERT-large, Elmoについては Google-RE, T-REx でほぼ同様の結果を得た。Elmo-5Bは未確認。 +BERT, BERT-large, Elmo については Google-RE, T-REx でほぼ同様の結果を得た。Elmo-5Bは未確認。 +Transformer-XL, GPTはおそらくメモリサイズの問題で実行できない。 +メモリサイズの問題か ConceptNetの評価中に強制終了する。 -Transformer-XL, GPTはおそらくメモリサイズの問題で実行不可。 -Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの問題で実行不可。 +RoBERTaは配布状態ではサポートされないが、動作させた。BERTより少し劣る結果。 ## 内容 @@ -21,12 +24,13 @@ Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの * 修正 * Elmoモデルの状態クリアを追加 * https://github.com/facebookresearch/LAMA/issues/30 - * GPT, RoBERTa 向け pre-trained_language_models を定義 + * 非効率だが relation 別の評価のたびに¥モデルをクリア。 + * GPT, FairSeq RoBERTa 向け pre-trained_language_models を定義 * FairSeq/HuggingFace Roberta向けSpecial tokenを定義 * Fairseq RoBERTa向けモデルダウンロード、vocaburaryのintersection取得を修正 * HuggingFace RoBERTa向けモデルダウンロード, HfRobertaConnectorを作成 * https://github.com/facebookresearch/LAMA/issues/15 - * bert_connectorを基本に gpt_connector のtokernizerの扱いを適用 + * bert_connectorを基本に gpt_connector のtokernizerの扱いを流用。 * MRR, precision計算での len(list_of_results) == 0 の場合を処理 * CUDAが利用できない場合の警告メッセージを抑制 @@ -60,39 +64,18 @@ Elmo, RoBERTa も ConceptNet での評価中におそらくメモリサイズの * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 * Fairseq RoBERTa .. BERTよりも少し悪い。ConceptNetの評価中にメモリ確保エラー。 * https://github.com/facebookresearch/LAMA/issues/16 - * HuggingFace RoBERTa .. + * HuggingFace RoBERTa .. Fairseq RoBERTaと同様。 + * 各種エラーはメモリサイズさえ大きければ生じない印象。 * TODO * P27に対応する文テンプレートが T-REx では不適切 https://github.com/facebookresearch/LAMA/issues/40 * 疑問 - * len(common_vocab) が小さければ、メモリに関するエラーは生じない? - * fairseq roberta.base に比べて huggingface roberta-base の成績が明確に悪い - * 語彙は同じ? --> おそらく違う - * elmo, elmo5B, bert-base, bert-lage ... 21107 - * + fairseq roberta.base ... 18129 - * + huggingface roberta-base ... 5254 - * 語彙を出力してみる。 - * 'Ġ' の扱い。SPCは'Ġ'にされる。ほか制御文字の扱い。可読文字ではないし共通語委作成時に削除されるはず - * https://github.com/huggingface/transformers/issues/3867#issuecomment-616956437 - * https://github.com/openai/gpt-2/issues/80#issuecomment-487202159 - * word 先頭の 'Ġ' だけ削除して + hf roberta-base ... 18117 .. ほぼ一致とみてよいか。 - * 予測も同じ? was_born_in だけででも。 - * precision@10でも悪いので、本当に悪いのかも。 - * vocab_intersection での除外条件が雑な気がする。これしかないのかもしれないが。 + * vocab_intersection での除外条件が粗い印象。これしかないのかもしれないが。 * nlp = spacy('en') に基づく。 * stop_word は上記 nlp が返すもの - * punctuation, synonym(symbol?)は各wordを nlp(word)の入力として tokenize結果のPOSで判断。よって数字だけでもPUNCTと判断されるケースがたびたび生じている。文脈が不足しているのだろう。 - -* メモ - * 語彙の大きさは同等だがこれまで以上にひどい。 - * intersectionをとるまでは、やたら制御文字だけ文がある。 - * 実際にデコードさせタ結果を確認する必要がある。 - * "" の前にSPACEをつけて " "とする必要がある? - * 文頭にもSPACEをつける? - * SPACEがついていないということは分割されたが先頭ではないトークンということで、そこに区別はあるのか? - * 一致をどうやって見ているのか。デコード時に適宜 SPACEを付与してやる必要があるのか。 - + * punctuation, symbolは各wordを nlp(word)の入力として tokenize結果のPOSで判断。よって数字だけでもPUNCTと判断されるケースがたびたび生じている。文脈が不足しているのだろう。 + * BPEを利用しているので、同じ語でも " word" と "word" の2つのトークンが対応する(はず)。前者は文中で登場するもの。後者は文頭に登場するか、語頭に続くものとして登場する。LAMAは1単語が複数トークンに分かれるケースを除外しているので、後者は語彙から除くことになる。しかし、文頭に登場する場合は前者と同様の意味を持つのではないか? obj-rel-subj のうちobjは文頭に出現することが多いだろうと考えると、そこで情報が失われていることはないのだろうか。まずはそこは捨てて考えてみているということか? ## 参考 diff --git a/download_models.sh b/download_models.sh index 3ee8074..6c6b52a 100755 --- a/download_models.sh +++ b/download_models.sh @@ -161,6 +161,6 @@ if [ ! -f "$DST_DIR/common_vocab_cased.txt" ]; then python lama/vocab_intersection.py else echo 'Already exists. Run to re-build:' - echo 'python util_KB_completion.py' + echo 'python lama/vocab_intersection.py' fi diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index ea0fef1..6f80bf3 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -97,7 +97,7 @@ def try_cuda(self): """Move model to GPU if one is available.""" if torch.cuda.is_available(): if self._model_device != 'cuda': - print('Moving model to CUDA') + # print('Moving model to CUDA') self._cuda() self._model_device = 'cuda' # else: diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 90b7b07..0990d44 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -190,7 +190,7 @@ def __get_input_tensors(self, sentences): if sentence_idx > 0: tokenized_text.append(ROBERTA_END_SENTENCE) # OPENAI_EOS) - sentence = ' ' + sentence + sentence = ' ' + sentence # add " " to the head of sentence for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): if chunk_idx > 0: masked_indices.append(len(tokenized_text)) From 43c764e8f2a53de8a95af084f6c652a76eeec7bf Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Fri, 29 Jan 2021 23:39:12 +0900 Subject: [PATCH 24/48] add hfroberta args to options --- README.md | 8 +++++++- lama/options.py | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 36adb5b..8ca3193 100644 --- a/README.md +++ b/README.md @@ -185,13 +185,19 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and * __--bert-vocab-name/--bvn__ : name of vocabulary used to pre-train the BERT model (default = 'vocab.txt') -### RoBERTa +### RoBERTa (Fairseq) * __--roberta-model-dir/--rmd__ : directory that contains the RoBERTa pre-trained model and the vocabulary (__REQUIRED__) * __--roberta-model-name/--rmn__ : name of the RoBERTa pre-trained model (default = 'model.pt') * __--roberta-vocab-name/--rvn__ : name of vocabulary used to pre-train the RoBERTa model (default = 'dict.txt') +### RoBERTa (HuggingFace) + +* __--hfroberta-model-dir/--hmd__ : directory that contains the HuggingFace RoBERTa pre-trained model and the vocabulary (__REQUIRED__) +* __--hfroberta-model-name/--hmn__ : name of the HuggingFace RoBERTa pre-trained model (default = 'roberta-base') + + ### ELMo * __--elmo-model-dir/--emd__ : directory that contains the ELMo pre-trained model and the vocabulary (__REQUIRED__) diff --git a/lama/options.py b/lama/options.py index 1f4a552..cfd1284 100644 --- a/lama/options.py +++ b/lama/options.py @@ -48,6 +48,7 @@ def get_general_parser(): __add_gpt_args(parser) __add_transformerxl_args(parser) __add_roberta_args(parser) + __add_hfroberta_args(parser) return parser @@ -156,6 +157,24 @@ def __add_roberta_args(parser): return group +def __add_hfroberta_args(parser): + group = parser.add_argument_group("HuggingFace RoBERTa") + group.add_argument( + "--hfroberta-model-dir", + "--hmd", + dest="hfroberta_model_dir", + help="directory that contains the HuggingFace ROBERTA pre-trained model and the vocabulary", + ) + group.add_argument( + "--hfroberta-model-name", + "--hmn", + dest="hfroberta_model_name", + default="roberta-base", + help="name of the HuggingFace ROBERTA pre-trained model (default = 'model.pt')", + ) + return group + + def __add_gpt_args(parser): group = parser.add_argument_group("GPT") group.add_argument( From 392c2b28b516f482a7d305f2a0b53f2552546a4f Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 1 Feb 2021 21:59:13 +0900 Subject: [PATCH 25/48] fix model dir --- lama/vocab_intersection.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 7292a8b..a70da3e 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -68,7 +68,7 @@ # "hfRoBERTa base" "lm" : "hfroberta", "hfroberta_model_name": "roberta-base", - "hfroberta_model_dir": None, + "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", } ] @@ -79,20 +79,23 @@ # "BERT BASE UNCASED" "lm": "bert", "bert_model_name": "bert-base-uncased", - "bert_model_dir": None, + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12", "bert_vocab_name": "vocab.txt" }, { # "BERT LARGE UNCASED" "lm": "bert", "bert_model_name": "bert-large-uncased", - "bert_model_dir": None, + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-24_H-1024_A-16", "bert_vocab_name": "vocab.txt" }, { # "OpenAI GPT" "lm": "gpt", - "gpt_model_dir": None, + #"gpt_model_dir": None, + "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt", "gpt_model_name": "openai-gpt" } ] From 1cd6b86f97c6b2c74392d956b7e8558a6ac80bdf Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 1 Feb 2021 22:03:30 +0900 Subject: [PATCH 26/48] fix gpt config --- scripts/run_experiments.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 364fdcd..a6c20e2 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -42,13 +42,13 @@ # "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", # "elmo_warm_up_cycles": 10, #}, - { - "lm": "bert", - "label": "bert_base", - "models_names": ["bert"], - "bert_model_name": "bert-base-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", - }, + #{ + # "lm": "bert", + # "label": "bert_base", + # "models_names": ["bert"], + # "bert_model_name": "bert-base-cased", + # "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + #}, #{ # "lm": "bert", # "label": "bert_large", @@ -62,15 +62,17 @@ # "models_names": ["gpt"], # "gpt_model_name": "openai-gpt", # "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", + # "lowercase": True, + # "common_vocab_filename": "pre-trained_language_models/common_vocab_lowercased.txt", + #}, + #{ + # "lm": "roberta", + # "label": "roberta.base", + # "models_names": ["roberta"], + # "roberta_model_name": "model.pt", + # "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + # "roberta_vocab_name": "dict.txt", #}, - { - "lm": "roberta", - "label": "roberta.base", - "models_names": ["roberta"], - "roberta_model_name": "model.pt", - "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", - "roberta_vocab_name": "dict.txt", - }, { "lm": "hfroberta", "label": "roberta-base", From 2aed3022f1ebe5e2388a2a356761825303a20643 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 1 Feb 2021 22:55:53 +0900 Subject: [PATCH 27/48] hide gpt config --- lama/modules/gpt_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lama/modules/gpt_connector.py b/lama/modules/gpt_connector.py index 7488cf6..a0a96bc 100644 --- a/lama/modules/gpt_connector.py +++ b/lama/modules/gpt_connector.py @@ -54,7 +54,7 @@ def convert_word(word): # Load pre-trained model (weights) self.gpt_model = OpenAIGPTLMHeadModel.from_pretrained(gpt_model_name) self.gpt_model.eval() - print(self.gpt_model.config) + # print(self.gpt_model.config) # Sanity check. assert len(self.vocab) == self.gpt_model.config.vocab_size From 6163e1af42fd344fc91ae3907a245bf9dc2dd8d0 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 1 Feb 2021 23:32:25 +0900 Subject: [PATCH 28/48] fix hfroberta get_id --- lama/modules/hfroberta_connector.py | 39 +++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 0990d44..6fb57f7 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -123,6 +123,45 @@ def _cuda(self): self.masked_roberta_model.cuda() def get_id(self, string): + tokenized_text = self.tokenizer.tokenize(f'i {string}') + print(tokenized_text) + tokenized_text = tokenized_text[2:-1] + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + return indexed_string + + def get_id2(self, string): + try : + ''' + " {string}"としてもtokenizerでは先頭SPACEを抜いてトークン列を返す。 + よって、そのままではSPACEにつづく語頭ではないトークンと思われる。 + 文頭に出てくるパターンのみ。 + なので、まずはとーかないずされたとして、idにもどせるか。 + さもなくば、しかたないので複数トークにわかれるかもしれないが、そのあとで先頭にくうあ箔をつけてみる。 + objectiveをたいしょうなので、文頭に来ることはない。よってSPACEを先頭にもトークンになるはず。 + + 文頭と文中で、ちがった分割がされるかのうせもい可能性もあるし。 + + GPTはトークンの最後にSPACEの /w をつける。 + fairseq robertaは? + ''' + tokenized_text = [ f'Ġ{string}' ] + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + #print("A", indexed_string) + except: + tokenized_text = self.tokenizer.tokenize(string) + tokenized_text[0] = f'Ġ{tokenized_text[0]}' + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + print("B", indexed_string) + return indexed_string + + def get_id1(self, string): + tokenized_text = self.tokenizer.tokenize(string) + tokenized_text[0] = f'Ġ{tokenized_text[0]}' + indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) + # indexed_string = self.convert_ids(indexed_string) + return indexed_string + + def get_id0(self, string): try: return [ self.inverse_vocab[string] ] except: From 1d4c38aca70e37fe29b9a2dc6ca42de0d5c66da6 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 1 Feb 2021 23:49:08 +0900 Subject: [PATCH 29/48] clean hfroberta get_id --- lama/modules/hfroberta_connector.py | 67 +++++++++++------------------ 1 file changed, 24 insertions(+), 43 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 6fb57f7..addacca 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -123,53 +123,34 @@ def _cuda(self): self.masked_roberta_model.cuda() def get_id(self, string): - tokenized_text = self.tokenizer.tokenize(f'i {string}') - print(tokenized_text) - tokenized_text = tokenized_text[2:-1] - indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) - return indexed_string + ''' + (obj を示す)stringをtokenizeしてid列に直す。 - def get_id2(self, string): - try : - ''' - " {string}"としてもtokenizerでは先頭SPACEを抜いてトークン列を返す。 - よって、そのままではSPACEにつづく語頭ではないトークンと思われる。 - 文頭に出てくるパターンのみ。 - なので、まずはとーかないずされたとして、idにもどせるか。 - さもなくば、しかたないので複数トークにわかれるかもしれないが、そのあとで先頭にくうあ箔をつけてみる。 - objectiveをたいしょうなので、文頭に来ることはない。よってSPACEを先頭にもトークンになるはず。 - - 文頭と文中で、ちがった分割がされるかのうせもい可能性もあるし。 - - GPTはトークンの最後にSPACEの /w をつける。 - fairseq robertaは? - ''' - tokenized_text = [ f'Ġ{string}' ] - indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) - #print("A", indexed_string) - except: - tokenized_text = self.tokenizer.tokenize(string) - tokenized_text[0] = f'Ġ{tokenized_text[0]}' - indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) - print("B", indexed_string) - return indexed_string + このときstringは先頭がSPACEで始まらない。 - def get_id1(self, string): - tokenized_text = self.tokenizer.tokenize(string) - tokenized_text[0] = f'Ġ{tokenized_text[0]}' - indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) - # indexed_string = self.convert_ids(indexed_string) - return indexed_string + しかし、内部のvocabは先頭がSPACEで始まる場合に限っている。 + + + " {string}"としてもtokenizerでは先頭SPACEを抜いてトークン列を返す。 + よって、そのままではSPACEにつづく語頭ではないトークンと思われる。 + 文頭に出てくるパターンのみ。 + なので、まずはとーかないずされたとして、idにもどせるか。 + さもなくば、しかたないので複数トークにわかれるかもしれないが、そのあとで先頭にくうあ箔をつけてみる。 + objectiveをたいしょうなので、文頭に来ることはない。よってSPACEを先頭にもトークンになるはず。 + + 文頭と文中で、ちがった分割がされるかのうせもい可能性もあるし。 - def get_id0(self, string): - try: - return [ self.inverse_vocab[string] ] - except: - tokenized_text = self.tokenizer.tokenize(f' {string}') - indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) - return indexed_string + GPTはトークンの最後にSPACEの /w をつける。 + fairseq robertaは? あれは自前でうまくやっている - tokenized_text = self.tokenizer.tokenize(string) + 最終的に, で囲むが、これは不要かもしれない。 + それよりも先頭にかならうず1トークンになるだろう 'i' を string前のSPACEの前に置くことで、 + 強制的に、非文頭の単語としてとーかないずされるように持ってくる + ''' + #tokenized_text = self.tokenizer.tokenize(f'i {string}') # '', 'i', '' はそれぞれ1トークンになるはず + #tokenized_text = tokenized_text[2:-1] + tokenized_text = self.tokenizer.tokenize(f'a {string}') # 'a' はそれぞれ1トークンになるはず + tokenized_text = tokenized_text[1:] indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) return indexed_string From 71c675ec7be328c264dea4600cc992424c5a4570 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 2 Feb 2021 00:36:21 +0900 Subject: [PATCH 30/48] clean hfroberta_connector --- lama/modules/hfroberta_connector.py | 50 +++-------------------------- 1 file changed, 4 insertions(+), 46 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index addacca..35a2f23 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -37,14 +37,7 @@ def __init__(self, args): # original vocab - # GPT uses different way to represent BPE then BERT. Namely, the - # final suffixes are indicated with suffix, while pieces that must - # be followed are written as is. In BERT the prefixes are written as is - # while the parts that must follow (not be followed!) have '##' prefix. - # There is no one-to-one coversion. But at least we may make pieces that - # may form a full word look the same. - # Note that we should be very careful now, - # tokenizer.convert_tokens_to_ids won't work with our vocabulary. + # The following process is baded on gpt_connector. # RoBERTa also uses BPE. the bytes_to_unicode function takes all control # and whitespace characters in code points 0-255 and shifts them up @@ -52,18 +45,7 @@ def __init__(self, args): # (copied from https://github.com/openai/gpt-2/issues/80#issuecomment-487202159). # # Other control characters will be removed during voca_intersection process. - def convert_word0(word): - if word == ROBERTA_UNK: # word == OPENAI_UNK: - return word - if word == '\n': - # Redefine symbol EOS to improve visualization. - return ROBERTA_EOS # OPENAI_EOS - # return word[:-4] if word.endswith('') else f'{word}##' - return word[:-4] if word.endswith('') else f'{word}' - def convert_word(word): - # return convert_word0(word) - if word == ROBERTA_UNK: return word if word == ROBERTA_MASK: @@ -123,33 +105,9 @@ def _cuda(self): self.masked_roberta_model.cuda() def get_id(self, string): - ''' - (obj を示す)stringをtokenizeしてid列に直す。 - - このときstringは先頭がSPACEで始まらない。 - - しかし、内部のvocabは先頭がSPACEで始まる場合に限っている。 - - - " {string}"としてもtokenizerでは先頭SPACEを抜いてトークン列を返す。 - よって、そのままではSPACEにつづく語頭ではないトークンと思われる。 - 文頭に出てくるパターンのみ。 - なので、まずはとーかないずされたとして、idにもどせるか。 - さもなくば、しかたないので複数トークにわかれるかもしれないが、そのあとで先頭にくうあ箔をつけてみる。 - objectiveをたいしょうなので、文頭に来ることはない。よってSPACEを先頭にもトークンになるはず。 - - 文頭と文中で、ちがった分割がされるかのうせもい可能性もあるし。 - - GPTはトークンの最後にSPACEの /w をつける。 - fairseq robertaは? あれは自前でうまくやっている - - 最終的に, で囲むが、これは不要かもしれない。 - それよりも先頭にかならうず1トークンになるだろう 'i' を string前のSPACEの前に置くことで、 - 強制的に、非文頭の単語としてとーかないずされるように持ってくる - ''' - #tokenized_text = self.tokenizer.tokenize(f'i {string}') # '', 'i', '' はそれぞれ1トークンになるはず - #tokenized_text = tokenized_text[2:-1] - tokenized_text = self.tokenizer.tokenize(f'a {string}') # 'a' はそれぞれ1トークンになるはず + # tokenize "a " + string, in order to create token_id(s) corresponding to the string. + # the first token of the string starts with a whitespace. + tokenized_text = self.tokenizer.tokenize(f'a {string}') tokenized_text = tokenized_text[1:] indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text) return indexed_string From 6cd41c143c475d946b23f06a6a1ece6a6e9bddf1 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 2 Feb 2021 09:12:14 +0900 Subject: [PATCH 31/48] fix --- lama/modules/base_connector.py | 2 +- scripts/batch_eval_KB_completion.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index 6f80bf3..301ade2 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -17,7 +17,7 @@ ELMO_END_SENTENCE = "" OPENAI_UNK = "" OPENAI_EOS = "" -ROBERTA_MASK = "" +ROBERTA_MASK = "" ROBERTA_START_SENTENCE = "" ROBERTA_END_SENTENCE = "" ROBERTA_VOCAB_SIZE = 50266 diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index b5d07af..ac806c0 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -157,8 +157,7 @@ def run_thread(arguments): label_index=arguments["label_index"], index_list=arguments["index_list"], print_generation=arguments["interactive"], -# topk=10000, - topk=5000, + topk=10000, ) msg += "\n" + return_msg From 726cf80ef042a7ab1d5e95161c2185a7ef5421c7 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 15 Feb 2021 01:37:20 +0900 Subject: [PATCH 32/48] fix BOS, EOS tokens for hfRoBERTa --- lama/modules/hfroberta_connector.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 35a2f23..418a638 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -166,15 +166,15 @@ def __get_input_tensors(self, sentences): segment_indices = [] for sentence_idx, sentence in enumerate(sentences): if sentence_idx > 0: - tokenized_text.append(ROBERTA_END_SENTENCE) # OPENAI_EOS) + tokenized_text.append(ROBERTA_START_SENTENCE) + segment_indices.append(sentence_idx) - sentence = ' ' + sentence # add " " to the head of sentence for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): if chunk_idx > 0: masked_indices.append(len(tokenized_text)) segment_indices.append(sentence_idx) tokenized_text.append(self.mask_symbol) - #chunk = chunk.strip() + chunk = chunk.strip() if chunk: tokenized_sentence = self.tokenizer.tokenize(chunk) segment_id = np.full(len(tokenized_sentence), @@ -183,6 +183,8 @@ def __get_input_tensors(self, sentences): tokenized_text.extend(tokenized_sentence) segment_indices.extend(segment_id) + tokenized_text.append(ROBERTA_END_SENTENCE) + segment_indices.append(sentence_idx) # add [CLS] token at the beginning tokenized_text.insert(0,ROBERTA_START_SENTENCE) From 48a5a2adbe1a253c404693cefd7bdc02517a1172 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 15 Feb 2021 01:54:53 +0900 Subject: [PATCH 33/48] clean __get_input_tensors of hfRoBERTA connector --- lama/modules/hfroberta_connector.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 418a638..06321b2 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -165,15 +165,12 @@ def __get_input_tensors(self, sentences): masked_indices = [] segment_indices = [] for sentence_idx, sentence in enumerate(sentences): - if sentence_idx > 0: - tokenized_text.append(ROBERTA_START_SENTENCE) - segment_indices.append(sentence_idx) - for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): if chunk_idx > 0: masked_indices.append(len(tokenized_text)) segment_indices.append(sentence_idx) tokenized_text.append(self.mask_symbol) + chunk = chunk.strip() if chunk: tokenized_sentence = self.tokenizer.tokenize(chunk) @@ -183,10 +180,12 @@ def __get_input_tensors(self, sentences): tokenized_text.extend(tokenized_sentence) segment_indices.extend(segment_id) + + # add [EOS] or [SEP] token at the end of sequence or sentence tokenized_text.append(ROBERTA_END_SENTENCE) segment_indices.append(sentence_idx) - # add [CLS] token at the beginning + # add [CLS] or [BOS] token at the beginning tokenized_text.insert(0,ROBERTA_START_SENTENCE) segment_indices.insert(0,0) From 6eb12ad2af15142e16febfcc26705b02047454ad Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 15 Feb 2021 22:59:12 +0900 Subject: [PATCH 34/48] add GPT-2 support --- README.md | 6 ++ download_models.sh | 12 +++ lama/modules/__init__.py | 4 +- lama/modules/base_connector.py | 19 ++-- lama/modules/gpt2_connector.py | 168 +++++++++++++++++++++++++++++++++ lama/options.py | 19 ++++ lama/vocab_intersection.py | 168 +++++++++++++++++---------------- scripts/run_experiments.py | 71 ++++++++------ 8 files changed, 348 insertions(+), 119 deletions(-) create mode 100644 lama/modules/gpt2_connector.py diff --git a/README.md b/README.md index 8ca3193..fc1bd7e 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,12 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and * __--gpt-model-name/--gmn__ : name of the gpt pre-trained model (default = 'openai-gpt') +### GPT2 + +* __--gpt2-model-dir/--g2d__ : directory that contains the gpt2 pre-trained model and the vocabulary (__REQUIRED__) +* __--gpt2-model-name/--g2n__ : name of the gpt2 pre-trained model (default = 'gpt2') + + ## Evaluate Language Model(s) Generation options: diff --git a/download_models.sh b/download_models.sh index 6c6b52a..6694b62 100755 --- a/download_models.sh +++ b/download_models.sh @@ -29,6 +29,18 @@ if [[ ! -f gpt/openai-gpt/config.json ]]; then cd ../.. fi +echo "GPT2" +if [[ ! -f gpt/gpt2/config.json ]]; then + rm -rf 'gpt/gpt2' + mkdir -p 'gpt/gpt2' + cd 'gpt/gpt2' + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json' -O vocab.json + wget 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt' -O merges.txt + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin' -O 'pytorch_model.bin' + wget -c 'https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json' -O 'config.json' + cd ../.. +fi + echo "BERT BASE LOWERCASED" if [[ ! -f bert/uncased_L-12_H-768_A-12/bert_config.json ]]; then mkdir -p 'bert' diff --git a/lama/modules/__init__.py b/lama/modules/__init__.py index f02e310..d1f4620 100644 --- a/lama/modules/__init__.py +++ b/lama/modules/__init__.py @@ -10,6 +10,7 @@ from .transformerxl_connector import TransformerXL from .roberta_connector import Roberta from .hfroberta_connector import HfRoberta +from .gpt2_connector import GPT2 def build_model_by_name(lm, args, verbose=True): @@ -24,7 +25,8 @@ def build_model_by_name(lm, args, verbose=True): gpt=GPT, transformerxl=TransformerXL, roberta=Roberta, - hfroberta=HfRoberta + hfroberta=HfRoberta, + gpt2=GPT2 ) if lm not in MODEL_NAME_TO_CLASS: raise ValueError("Unrecognized Language Model: %s." % lm) diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index 301ade2..1869afd 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -12,17 +12,23 @@ BERT_CLS = "[CLS]" BERT_SEP = "[SEP]" BERT_PAD = "[PAD]" + ELMO_UNK = "" ELMO_START_SENTENCE = "" ELMO_END_SENTENCE = "" + OPENAI_UNK = "" OPENAI_EOS = "" -ROBERTA_MASK = "" -ROBERTA_START_SENTENCE = "" -ROBERTA_END_SENTENCE = "" -ROBERTA_VOCAB_SIZE = 50266 -ROBERTA_UNK = "" -ROBERTA_PAD = "" + +ROBERTA_MASK = "" # MASK for fairseq/huggingface RoBERTa +ROBERTA_VOCAB_SIZE = 50266 # for fairseq RoBERTa + +ROBERTA_START_SENTENCE = "" # BOS, CLS for huggingface RoBERTa +ROBERTA_END_SENTENCE = "" # EOS, SEP for huggingface RoBERTa +ROBERTA_UNK = "" # UNK for huggingface RoBERTa +ROBERTA_PAD = "" # PAD for huggingface RoBERTa + +GPT2_EOS = "<|endoftext|>" # BOS, EOS, UNK, PAD for GPT2 SPECIAL_SYMBOLS = [ @@ -41,6 +47,7 @@ ROBERTA_PAD, ROBERTA_START_SENTENCE, ROBERTA_END_SENTENCE, + GPT2_EOS ] SPACE_NORMALIZER = re.compile(r"\s+") diff --git a/lama/modules/gpt2_connector.py b/lama/modules/gpt2_connector.py new file mode 100644 index 0000000..c3da7e0 --- /dev/null +++ b/lama/modules/gpt2_connector.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# +from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer +import numpy as np +from lama.modules.base_connector import * + + +class GPT2(Base_Connector): + + def __init__(self, args): + super().__init__() + + if args.gpt2_model_dir is not None: + # load GPT2 model from file + gpt_model_name = str(args.gpt2_model_dir) + "/" + dict_file = gpt_model_name + print("loading GPT2 model from {}".format(gpt_model_name)) + else: + # load GPT2 model from huggingface cache + gpt_model_name = args.gpt2_model_name + dict_file = gpt_model_name + + # Load pre-trained model tokenizer (vocabulary) + self.tokenizer = GPT2Tokenizer.from_pretrained(dict_file) + + # GPT uses different way to represent BPE then BERT. Namely, the + # final suffixes are indicated with suffix, while pieces that must + # be followed are written as is. In BERT the prefixes are written as is + # while the parts that must follow (not be followed!) have '##' prefix. + # There is no one-to-one coversion. But at least we may make pieces that + # may form a full word look the same. + # Note that we should be very careful now, + # tokenizer.convert_tokens_to_ids won't work with our vocabulary. + + def convert_word(word): + if word == GPT2_EOS: + return word + + if word.startswith('Ġ'): # the token starts with a whitespace + return word[1:] + + return f'_{word}_' # the token not start with a white space. + # may be not a head of a word, + # or may be a head of a sentence. + + _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items())) + self.vocab = [convert_word(word) for word in gpt_vocab] + self._init_inverse_vocab() + + # Load pre-trained model (weights) + self.gpt_model = GPT2LMHeadModel.from_pretrained(gpt_model_name) + self.gpt_model.eval() + # print(self.gpt_model.config) + + # Sanity check. + assert len(self.vocab) == self.gpt_model.config.vocab_size + #assert 0 == self.gpt_model.config.n_special + + self.eos_id = self.gpt_model.config.eos_token_id + self.pad_id = self.gpt_model.config.eos_token_id + self.unk_id = self.gpt_model.config.eos_token_id + self.bos_id = self.gpt_model.config.bos_token_id + self.model_vocab = self.vocab + + def _cuda(self): + self.gpt_model.cuda() + + def get_id(self, string): + indexed_string = self.tokenizer.encode(f'a {string}')[1:] + return indexed_string + + def __get_input_tensors(self, sentence_list): + """Concatenates, tokenize and converts a sentences to model inputs. + + Args: + sentence_list: A list of strings. The string may contain a special + [MASK] token. + + Returns: + A tuple (src_tensor, dst_tensor, masked_indices, tokenized_text). + src_tensor: torch.LongTensor with shape (seq_len), the input to + the new without the last symbol and with EOS prepended. + dst_tensor: torch.LongTensor with shape (seq_len). + masked_indices: A list of indices of [MASK] in dst_tensor. + tokenized_text: A list of token string. + """ + # Split the sentence by [MASK] and tokenize the chunks independently. + tokenized_text = [] + masked_indices = [] + for sentence_idx, sentence in enumerate(sentence_list): + if sentence_idx > 0: + tokenized_text.append(self.eos_id) + for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): + if chunk_idx > 0: + masked_indices.append(len(tokenized_text)) + tokenized_text.append(self.unk_id) # use UNK as [MASK] + chunk = chunk.strip() + if chunk: + tokenized_sentence = self.tokenizer.encode(chunk) + tokenized_text.extend(tokenized_sentence) + + full_indexed_tokens = [ + self.bos_id + ] + tokenized_text + full_tokens_tensor = torch.tensor(full_indexed_tokens) + src_tensor = full_tokens_tensor[:-1] + dst_tensor = full_tokens_tensor[1:] + + tokenized_text = self.tokenizer.decode(tokenized_text) + + return src_tensor, dst_tensor, masked_indices, tokenized_text + + def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): + if try_cuda: + self.try_cuda() + src_tensor_list, dst_tensor_list, masked_indices_list, _ = zip(*[ + self.__get_input_tensors(sentences) for sentences in sentences_list + ]) + + src_tensor_batch = torch.nn.utils.rnn.pad_sequence( + src_tensor_list, batch_first=True) + + # The model uses shared embedding space for tokens and positions. More + # precisely, the first len(vocab) indidices are reseved for words, the + # last n_special symbols are reserved for special symbols and the rest + # is used for positions. Softmax and embedding matrices are shared and + # as result some of output "symbols" correspond to positions. To fix + # that we have to manually remove logits for positions. + with torch.no_grad(): + logits = self.gpt_model(src_tensor_batch.to(self._model_device)) + logits = logits[0] + logits = logits[..., :self.gpt_model.config.vocab_size] + + log_probs = torch.nn.functional.log_softmax(logits, dim=-1).cpu() + + token_ids_list = [ + np.array(dst_tensor.numpy()) for dst_tensor in dst_tensor_list + ] + + return log_probs, token_ids_list, masked_indices_list + + def get_contextual_embeddings(self, sentences_list, try_cuda=True): + + if try_cuda: + self.try_cuda() + + src_tensor_list, dst_tensor_list, masked_indices_list, _ = zip(*[ + self.__get_input_tensors(sentences) for sentences in sentences_list + ]) + + src_tensor_batch = torch.nn.utils.rnn.pad_sequence( + src_tensor_list, batch_first=True) + + with torch.no_grad(): + output = self.gpt_model.transformer(src_tensor_batch.to(self._model_device)) + + # TODO + sentence_lengths = None + tokenized_text_list = None + + # As we only return the last layer, [] to have the same format as other models + return [output], sentence_lengths, tokenized_text_list + + diff --git a/lama/options.py b/lama/options.py index cfd1284..0d771d9 100644 --- a/lama/options.py +++ b/lama/options.py @@ -49,6 +49,7 @@ def get_general_parser(): __add_transformerxl_args(parser) __add_roberta_args(parser) __add_hfroberta_args(parser) + __add_gpt2_args(parser) return parser @@ -193,6 +194,24 @@ def __add_gpt_args(parser): return group +def __add_gpt2_args(parser): + group = parser.add_argument_group("GPT2") + group.add_argument( + "--gpt2-model-dir", + "--g2d", + dest="gpt2_model_dir", + help="directory that contains the gpt2 pre-trained model and the vocabulary", + ) + group.add_argument( + "--gpt2-model-name", + "--g2n", + dest="gpt2_model_name", + default="gpt2", + help="name of the gpt2 pre-trained model (default = 'gpt2')", + ) + return group + + def __add_transformerxl_args(parser): group = parser.add_argument_group("GPT") group.add_argument( diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index a70da3e..702a161 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -12,92 +12,98 @@ CASED_MODELS = [ - # { - # # "FAIRSEQ WIKI103" - # "lm": "fairseq", - # "data": "pre-trained_language_models/fairseq/wiki103_fconv_lm/", - # "fairseq_model_name": "wiki103.pt", - # "task": "language_modeling", - # "cpu": True, - # "output_dictionary_size": -1 - # }, - # { - # # "TransformerXL" - # "lm": "transformerxl", - # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", - # }, - { - # "ELMO ORIGINAL" - "lm": "elmo", - "elmo_model_dir": "pre-trained_language_models/elmo/original", - "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", - "elmo_vocab_name": "vocab-2016-09-10.txt", - "elmo_warm_up_cycles": 5 - }, - { - # "ELMO ORIGINAL 5.5B" - "lm": "elmo", - "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B", - "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", - "elmo_vocab_name": "vocab-enwiki-news-500000.txt", - "elmo_warm_up_cycles": 5 - }, - { - # "BERT BASE CASED" - "lm": "bert", - "bert_model_name": "bert-base-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", - "bert_vocab_name": "vocab.txt" - }, - { - # "BERT LARGE CASED" - "lm" : "bert", - "bert_model_name": "bert-large-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", - "bert_vocab_name": "vocab.txt" - }, - { - # "RoBERTa base" - "lm" : "roberta", - "roberta_model_name": "model.pt", - "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", - "roberta_vocab_name": "dict.txt", - "max_sentence_length": 100 - }, - { - # "hfRoBERTa base" - "lm" : "hfroberta", - "hfroberta_model_name": "roberta-base", - "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", - } +# { + # # "FAIRSEQ WIKI103" + # "lm": "fairseq", + # "data": "pre-trained_language_models/fairseq/wiki103_fconv_lm/", + # "fairseq_model_name": "wiki103.pt", + # "task": "language_modeling", + # "cpu": True, + # "output_dictionary_size": -1 + # }, + # { + # # "TransformerXL" + # "lm": "transformerxl", + # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", +# }, + { + # "ELMO ORIGINAL" + "lm": "elmo", + "elmo_model_dir": "pre-trained_language_models/elmo/original", + "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", + "elmo_vocab_name": "vocab-2016-09-10.txt", + "elmo_warm_up_cycles": 5 + }, + { + # "ELMO ORIGINAL 5.5B" + "lm": "elmo", + "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B", + "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", + "elmo_vocab_name": "vocab-enwiki-news-500000.txt", + "elmo_warm_up_cycles": 5 + }, + { + # "BERT BASE CASED" + "lm": "bert", + "bert_model_name": "bert-base-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + "bert_vocab_name": "vocab.txt" + }, + { + # "BERT LARGE CASED" + "lm" : "bert", + "bert_model_name": "bert-large-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", + "bert_vocab_name": "vocab.txt" + }, + { + # "RoBERTa base" + "lm" : "roberta", + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + "max_sentence_length": 100 + }, + { + # "hfRoBERTa base" + "lm" : "hfroberta", + "hfroberta_model_name": "roberta-base", + "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", + }, + { + # "OpenAI GPT-2" + "lm": "gpt2", + "gpt2_model_name": "gpt2", + "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", + }, ] CASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_cased.txt" LOWERCASED_MODELS = [ - { - # "BERT BASE UNCASED" - "lm": "bert", - "bert_model_name": "bert-base-uncased", - #"bert_model_dir": None, - "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12", - "bert_vocab_name": "vocab.txt" - }, - { - # "BERT LARGE UNCASED" - "lm": "bert", - "bert_model_name": "bert-large-uncased", - #"bert_model_dir": None, - "bert_model_dir": "pre-trained_language_models/bert/uncased_L-24_H-1024_A-16", - "bert_vocab_name": "vocab.txt" - }, - { - # "OpenAI GPT" - "lm": "gpt", - #"gpt_model_dir": None, - "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt", - "gpt_model_name": "openai-gpt" - } + { + # "BERT BASE UNCASED" + "lm": "bert", + "bert_model_name": "bert-base-uncased", + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12", + "bert_vocab_name": "vocab.txt" + }, + { + # "BERT LARGE UNCASED" + "lm": "bert", + "bert_model_name": "bert-large-uncased", + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-24_H-1024_A-16", + "bert_vocab_name": "vocab.txt" + }, + { + # "OpenAI GPT" + "lm": "gpt", + #"gpt_model_dir": None, + "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt", + "gpt_model_name": "openai-gpt" + }, ] LOWERCASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_lowercased.txt" diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index a6c20e2..5834e3b 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -42,37 +42,37 @@ # "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", # "elmo_warm_up_cycles": 10, #}, - #{ - # "lm": "bert", - # "label": "bert_base", - # "models_names": ["bert"], - # "bert_model_name": "bert-base-cased", - # "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", - #}, - #{ - # "lm": "bert", - # "label": "bert_large", - # "models_names": ["bert"], - # "bert_model_name": "bert-large-cased", - # "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", - #}, - #{ - # "lm": "gpt", - # "label": "gpt", - # "models_names": ["gpt"], - # "gpt_model_name": "openai-gpt", - # "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", - # "lowercase": True, - # "common_vocab_filename": "pre-trained_language_models/common_vocab_lowercased.txt", - #}, - #{ - # "lm": "roberta", - # "label": "roberta.base", - # "models_names": ["roberta"], - # "roberta_model_name": "model.pt", - # "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", - # "roberta_vocab_name": "dict.txt", - #}, + { + "lm": "bert", + "label": "bert_base", + "models_names": ["bert"], + "bert_model_name": "bert-base-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + }, + { + "lm": "bert", + "label": "bert_large", + "models_names": ["bert"], + "bert_model_name": "bert-large-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", + }, + { + "lm": "gpt", + "label": "gpt", + "models_names": ["gpt"], + "gpt_model_name": "openai-gpt", + "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt/", + "lowercase": True, + "common_vocab_filename": "pre-trained_language_models/common_vocab_lowercased.txt", + }, + { + "lm": "roberta", + "label": "roberta.base", + "models_names": ["roberta"], + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + }, { "lm": "hfroberta", "label": "roberta-base", @@ -80,6 +80,15 @@ "hfroberta_model_name": "roberta-base", "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", }, + { + # "OpenAIGPT2" + "lm": "gpt2", + "label": "gpt2", + "models_names": ["gpt2"], + "gpt2_model_name": "gpt2", + "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", + }, + ] From afcf1488bd180e7aac4d9ad3402878172987a434 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 15 Feb 2021 22:59:30 +0900 Subject: [PATCH 35/48] fix hfroberta --- lama/modules/hfroberta_connector.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index 06321b2..f16f020 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -164,6 +164,11 @@ def __get_input_tensors(self, sentences): tokenized_text = [] masked_indices = [] segment_indices = [] + + # add [CLS] or [BOS] token at the beginning + tokenized_text.append(ROBERTA_START_SENTENCE) + segment_indices.append(0) + for sentence_idx, sentence in enumerate(sentences): for chunk_idx, chunk in enumerate(sentence.split('[MASK]')): if chunk_idx > 0: @@ -185,10 +190,6 @@ def __get_input_tensors(self, sentences): tokenized_text.append(ROBERTA_END_SENTENCE) segment_indices.append(sentence_idx) - # add [CLS] or [BOS] token at the beginning - tokenized_text.insert(0,ROBERTA_START_SENTENCE) - segment_indices.insert(0,0) - # look for masked indices masked_indices = [] for i in range(len(tokenized_text)): From 156b888fb31e9295e750c2c1ba7b89f6b91e5171 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 15 Feb 2021 23:48:53 +0900 Subject: [PATCH 36/48] update README_morioka.md --- README_morioka.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README_morioka.md b/README_morioka.md index 973cd18..c86521f 100644 --- a/README_morioka.md +++ b/README_morioka.md @@ -2,6 +2,7 @@ 2021-01-12 2021-01-29 更新 +2021-02-15 更新 Yasuhiro MORIOKA @@ -9,13 +10,16 @@ Yasuhiro MORIOKA LAMA(LAnguage Model Analysis) の環境をそのまま使って、結果の再現を試みた。 README.md の手順どおりに進めただけで、独自のデータセットやモデルで確認していない。 +LAMA-UHN, Negated-LAMA は試しておきたい。 BERT, BERT-large, Elmo については Google-RE, T-REx でほぼ同様の結果を得た。Elmo-5Bは未確認。 -Transformer-XL, GPTはおそらくメモリサイズの問題で実行できない。 +Transformer-XL, GPTはおそらく実行環境のメモリサイズの問題で実行できていない。 メモリサイズの問題か ConceptNetの評価中に強制終了する。 RoBERTaは配布状態ではサポートされないが、動作させた。BERTより少し劣る結果。 +GPT-2もサポートした。 + ## 内容 * 環境 @@ -33,6 +37,7 @@ RoBERTaは配布状態ではサポートされないが、動作させた。BERT * bert_connectorを基本に gpt_connector のtokernizerの扱いを流用。 * MRR, precision計算での len(list_of_results) == 0 の場合を処理 * CUDAが利用できない場合の警告メッセージを抑制 + * GPT-2 モデルダウンロード、connectorを作成。 * 実行 * "The LAMA probe" の手順をそのまま実行。 From 772dc08695b2808b952443b860fe99a5f5578232 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Mon, 15 Feb 2021 23:52:41 +0900 Subject: [PATCH 37/48] update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fc1bd7e..c1f6b73 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ LAMA exposes a transparent and unique interface to use: - BERT (Devlin et al., 2018) - ELMo (Peters et al., 2018) - GPT (Radford et al., 2018) +- GPT-2 (Radford et al., 2019) - RoBERTa (Liu et al., 2019) Actually, LAMA is also a beautiful animal. @@ -217,7 +218,7 @@ BERT pretrained models can be loaded both: (i) passing the name of the model and * __--gpt-model-name/--gmn__ : name of the gpt pre-trained model (default = 'openai-gpt') -### GPT2 +### GPT-2 * __--gpt2-model-dir/--g2d__ : directory that contains the gpt2 pre-trained model and the vocabulary (__REQUIRED__) * __--gpt2-model-name/--g2n__ : name of the gpt2 pre-trained model (default = 'gpt2') From ceec5f6739e89aacb2d6b871592390a458cd7622 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 16 Feb 2021 08:39:36 +0900 Subject: [PATCH 38/48] quick fix not masked_sentences but masked_sentence issue in TREx dataset --- scripts/batch_eval_KB_completion.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index ac806c0..edd1ca9 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -203,10 +203,17 @@ def lowercase_samples(samples, use_negated_probes=False): sample["obj_label"] = sample["obj_label"].lower() sample["sub_label"] = sample["sub_label"].lower() lower_masked_sentences = [] - for sentence in sample["masked_sentences"]: - sentence = sentence.lower() - sentence = sentence.replace(base.MASK.lower(), base.MASK) - lower_masked_sentences.append(sentence) + try: + for sentence in sample["masked_sentences"]: + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) + except KeyError: + for evidence in sample['evidences']: # TREx + sentence = evidence['masked_sentence'] + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) sample["masked_sentences"] = lower_masked_sentences if "negated" in sample and use_negated_probes: @@ -229,7 +236,7 @@ def filter_samples(model, samples, vocab_subset, max_sentence_length, template): if "obj_label" in sample and "sub_label" in sample: obj_label_ids = model.get_id(sample["obj_label"]) -# print(f'obj_label: {sample["obj_label"]} -> {obj_label_ids}') + # print(f'obj_label: {sample["obj_label"]} -> {obj_label_ids}') if obj_label_ids: recostructed_word = " ".join( @@ -238,7 +245,7 @@ def filter_samples(model, samples, vocab_subset, max_sentence_length, template): else: recostructed_word = None -# print(f'reconstructed_word: {recostructed_word}') + # print(f'reconstructed_word: {recostructed_word}') excluded = False if not template or len(template) == 0: @@ -392,6 +399,14 @@ def main(args, shuffle_data=True, model=None): else: # keep samples as they are all_samples = data + # TREx data + for i, sample in enumerate(all_samples): + if 'masked_sentences' not in sample.keys(): + sample['masked_sentences'] = [] + for evidence in sample['evidences']: + sample['masked_sentences'].append(evidence['masked_sentence']) + if i == 0: + print('not masked_sentences, but masked_sentence.') all_samples, ret_msg = filter_samples( model, data, vocab_subset, args.max_sentence_length, args.template From 527754ecc357abf087e1d514263a74104b048692 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 16 Feb 2021 13:08:22 +0900 Subject: [PATCH 39/48] clean --- lama/modules/gpt2_connector.py | 3 +-- lama/modules/hfroberta_connector.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lama/modules/gpt2_connector.py b/lama/modules/gpt2_connector.py index c3da7e0..8e5b7b2 100644 --- a/lama/modules/gpt2_connector.py +++ b/lama/modules/gpt2_connector.py @@ -131,8 +131,7 @@ def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): # as result some of output "symbols" correspond to positions. To fix # that we have to manually remove logits for positions. with torch.no_grad(): - logits = self.gpt_model(src_tensor_batch.to(self._model_device)) - logits = logits[0] + logits = self.gpt_model(src_tensor_batch.to(self._model_device))[0] logits = logits[..., :self.gpt_model.config.vocab_size] log_probs = torch.nn.functional.log_softmax(logits, dim=-1).cpu() diff --git a/lama/modules/hfroberta_connector.py b/lama/modules/hfroberta_connector.py index f16f020..4cf7c94 100644 --- a/lama/modules/hfroberta_connector.py +++ b/lama/modules/hfroberta_connector.py @@ -222,8 +222,7 @@ def get_batch_generation(self, sentences_list, logger=None, try_cuda=True): input_ids=tokens_tensor.to(self._model_device), token_type_ids=segments_tensor.to(self._model_device), attention_mask=attention_mask_tensor.to(self._model_device), - ) - logits = logits[0] + )[0] log_probs = F.log_softmax(logits, dim=-1).cpu() From 719f21776063a0952b71a4c84c7215a2d55f6ca9 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 18 Feb 2021 22:17:48 +0900 Subject: [PATCH 40/48] fix --- scripts/batch_eval_KB_completion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index edd1ca9..a924ea9 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -405,8 +405,8 @@ def main(args, shuffle_data=True, model=None): sample['masked_sentences'] = [] for evidence in sample['evidences']: sample['masked_sentences'].append(evidence['masked_sentence']) - if i == 0: - print('not masked_sentences, but masked_sentence.') + if i == 0: + print('not masked_sentences, but masked_sentence.') all_samples, ret_msg = filter_samples( model, data, vocab_subset, args.max_sentence_length, args.template From 9b9833613899022ca1424e56c0d5479f1504b121 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Thu, 18 Feb 2021 22:18:56 +0900 Subject: [PATCH 41/48] add switch for Negated-LAMA --- scripts/run_experiments.py | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 5834e3b..7bf50ce 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -24,24 +24,24 @@ # "transformerxl_model_name": "transfo-xl-wt103", # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", #}, - #{ - # "lm": "elmo", - # "label": "elmo", - # "models_names": ["elmo"], - # "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", - # "elmo_vocab_name": "vocab-2016-09-10.txt", - # "elmo_model_dir": "pre-trained_language_models/elmo/original", - # "elmo_warm_up_cycles": 10, - #}, - #{ - # "lm": "elmo", - # "label": "elmo5B", - # "models_names": ["elmo"], - # "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", - # "elmo_vocab_name": "vocab-enwiki-news-500000.txt", - # "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", - # "elmo_warm_up_cycles": 10, - #}, + { + "lm": "elmo", + "label": "elmo", + "models_names": ["elmo"], + "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", + "elmo_vocab_name": "vocab-2016-09-10.txt", + "elmo_model_dir": "pre-trained_language_models/elmo/original", + "elmo_warm_up_cycles": 10, + }, + { + "lm": "elmo", + "label": "elmo5B", + "models_names": ["elmo"], + "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", + "elmo_vocab_name": "vocab-enwiki-news-500000.txt", + "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", + "elmo_warm_up_cycles": 10, + }, { "lm": "bert", "label": "bert_base", @@ -81,17 +81,14 @@ "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", }, { - # "OpenAIGPT2" "lm": "gpt2", "label": "gpt2", "models_names": ["gpt2"], "gpt2_model_name": "gpt2", "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", }, - ] - def run_experiments( relations, data_path_pre, @@ -246,7 +243,10 @@ def get_Squad_parameters(data_path_pre="data/"): def run_all_LMs(parameters): for ip in LMs: print(ip["label"]) - run_experiments(*parameters, input_param=ip, use_negated_probes=False) + + use_negated_probes = False # vanilla LAMA + # use_negated_probes = True # Negated-LAMA + run_experiments(*parameters, input_param=ip, use_negated_probes=use_negated_probes) if __name__ == "__main__": From 831652a8bee356778dd3c0c5f9aaa58a2d0e309e Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 24 Feb 2021 23:47:42 +0900 Subject: [PATCH 42/48] fix sub_label issue in lowercase --- scripts/batch_eval_KB_completion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index a924ea9..a7b5d02 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -201,7 +201,10 @@ def lowercase_samples(samples, use_negated_probes=False): new_samples = [] for sample in samples: sample["obj_label"] = sample["obj_label"].lower() - sample["sub_label"] = sample["sub_label"].lower() + try: + sample["sub_label"] = sample["sub_label"].lower() + except KeyError: # ConceptNet + None lower_masked_sentences = [] try: for sentence in sample["masked_sentences"]: From 2f6351a3f69a3cd29493b0f4a957f77bf63a4839 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 17 Aug 2021 21:39:16 +0900 Subject: [PATCH 43/48] fix handling elmo --- scripts/run_experiments.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 7bf50ce..b2c10b8 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -154,10 +154,11 @@ def run_experiments( print("Exception: {}".format(e)) continue - # https://github.com/facebookresearch/LAMA/issues/30 - if model is not None: - del model - model = None + # fix https://github.com/facebookresearch/LAMA/issues/30 + if input_param["lm"] in ["elmo"]: + if model is not None: + del model + model = None if model is None: [model_type_name] = args.models_names From 0f6204c5a87a9769b29222fb95772ff9f1f7e6c1 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 17 Aug 2021 23:21:09 +0900 Subject: [PATCH 44/48] clean --- README_morioka.md | 96 --------------- lama/modules/base_connector.py | 6 +- lama/modules/gpt_connector.py | 2 +- lama/options.py | 2 +- lama/vocab_intersection.py | 174 ++++++++++++++-------------- scripts/batch_eval_KB_completion.py | 3 - scripts/run_experiments.py | 4 +- 7 files changed, 93 insertions(+), 194 deletions(-) delete mode 100644 README_morioka.md diff --git a/README_morioka.md b/README_morioka.md deleted file mode 100644 index c86521f..0000000 --- a/README_morioka.md +++ /dev/null @@ -1,96 +0,0 @@ -# LAMA(LAnguage Model Analysis) を再確認してみた - -2021-01-12 -2021-01-29 更新 -2021-02-15 更新 - -Yasuhiro MORIOKA - -## 概要 - -LAMA(LAnguage Model Analysis) の環境をそのまま使って、結果の再現を試みた。 -README.md の手順どおりに進めただけで、独自のデータセットやモデルで確認していない。 -LAMA-UHN, Negated-LAMA は試しておきたい。 - -BERT, BERT-large, Elmo については Google-RE, T-REx でほぼ同様の結果を得た。Elmo-5Bは未確認。 -Transformer-XL, GPTはおそらく実行環境のメモリサイズの問題で実行できていない。 -メモリサイズの問題か ConceptNetの評価中に強制終了する。 - -RoBERTaは配布状態ではサポートされないが、動作させた。BERTより少し劣る結果。 - -GPT-2もサポートした。 - -## 内容 - -* 環境 - * ThinkPad E495 (AMD Ryzen5 2.1GHz, RAM 32GB, GPUなし) - * Windows 10 Home, WSL2, Ubuntu 20.04 -* 修正 - * Elmoモデルの状態クリアを追加 - * https://github.com/facebookresearch/LAMA/issues/30 - * 非効率だが relation 別の評価のたびに¥モデルをクリア。 - * GPT, FairSeq RoBERTa 向け pre-trained_language_models を定義 - * FairSeq/HuggingFace Roberta向けSpecial tokenを定義 - * Fairseq RoBERTa向けモデルダウンロード、vocaburaryのintersection取得を修正 - * HuggingFace RoBERTa向けモデルダウンロード, HfRobertaConnectorを作成 - * https://github.com/facebookresearch/LAMA/issues/15 - * bert_connectorを基本に gpt_connector のtokernizerの扱いを流用。 - * MRR, precision計算での len(list_of_results) == 0 の場合を処理 - * CUDAが利用できない場合の警告メッセージを抑制 - * GPT-2 モデルダウンロード、connectorを作成。 - -* 実行 - * "The LAMA probe" の手順をそのまま実行。 - -* 実行時の注意 - - * pyenv-virtualenv 構成で minoconda-3.7 環境を用意し、さらに lama37環境を用意した。 - * pyenv と anaconda のactivateが衝突するので、次の手順で回避。 - * またローカルのlamaモジュールをロードできるようPYTHONPATHを修正。 - - ``` - $ pyenv activate miniconda-3.7.0/envs/lama37 - $ export PYTHONPATH=.'' - $ python scripts/run_experiments.py 2>&1 | tee output.log - ``` - - * 参考 - * pyenvとanacondaを共存させる時のactivate衝突問題の回避策3種類 - Qiita - * https://qiita.com/y__sama/items/f732bb7bec2bff355b69 - * ModuleNotFoundError: No module named 'lama' · Issue #20 · facebookresearch/LAMA - * https://github.com/facebookresearch/LAMA/issues/20 - - -* 結果 - * BERT, BERT-large .. ほぼ論文のとおり。 - * Elmo .. Google-RE, T-REx の評価後、ConceptNetでの評価に移るが途中でエラーも警告も出力せずに終了する。 - * Elmo-5B .. 未実施 - * Transformer-XL .. RuntimeError: $ Torch: invalid memory size -- maybe an overflow? at /pytorch/aten/src/TH/THGeneral.cpp:188 エラー。 - * GPT .. 大量の word FOO from vocab_subset in model vocabulary! 警告が表示され、評価回数が0となって div0 エラー。 - * Fairseq RoBERTa .. BERTよりも少し悪い。ConceptNetの評価中にメモリ確保エラー。 - * https://github.com/facebookresearch/LAMA/issues/16 - * HuggingFace RoBERTa .. Fairseq RoBERTaと同様。 - * 各種エラーはメモリサイズさえ大きければ生じない印象。 - -* TODO - * P27に対応する文テンプレートが T-REx では不適切 https://github.com/facebookresearch/LAMA/issues/40 - -* 疑問 - * vocab_intersection での除外条件が粗い印象。これしかないのかもしれないが。 - * nlp = spacy('en') に基づく。 - * stop_word は上記 nlp が返すもの - * punctuation, symbolは各wordを nlp(word)の入力として tokenize結果のPOSで判断。よって数字だけでもPUNCTと判断されるケースがたびたび生じている。文脈が不足しているのだろう。 - * BPEを利用しているので、同じ語でも " word" と "word" の2つのトークンが対応する(はず)。前者は文中で登場するもの。後者は文頭に登場するか、語頭に続くものとして登場する。LAMAは1単語が複数トークンに分かれるケースを除外しているので、後者は語彙から除くことになる。しかし、文頭に登場する場合は前者と同様の意味を持つのではないか? obj-rel-subj のうちobjは文頭に出現することが多いだろうと考えると、そこで情報が失われていることはないのだろうか。まずはそこは捨てて考えてみているということか? - -## 参考 - -* https://github.com/facebookresearch/LAMA -* https://arxiv.org/pdf/1909.01066.pdf -* https://openreview.net/forum?id=025X0zPfn - -* https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.md - -* http://lotus.kuee.kyoto-u.ac.jp/~kurita/snlp2019_kurita.pdf -* https://blog.hoxo-m.com/entry/2019/10/24/083000#3-Language-Models-as-Knowledge-Bases -* https://twitter.com/gneubig/status/1177276621172150272 - diff --git a/lama/modules/base_connector.py b/lama/modules/base_connector.py index 1869afd..3127160 100644 --- a/lama/modules/base_connector.py +++ b/lama/modules/base_connector.py @@ -104,11 +104,11 @@ def try_cuda(self): """Move model to GPU if one is available.""" if torch.cuda.is_available(): if self._model_device != 'cuda': - # print('Moving model to CUDA') + print('Moving model to CUDA') self._cuda() self._model_device = 'cuda' - # else: - # print('No CUDA found') + else: + print('No CUDA found') def _cuda(self): """Move model to GPU.""" diff --git a/lama/modules/gpt_connector.py b/lama/modules/gpt_connector.py index a0a96bc..7488cf6 100644 --- a/lama/modules/gpt_connector.py +++ b/lama/modules/gpt_connector.py @@ -54,7 +54,7 @@ def convert_word(word): # Load pre-trained model (weights) self.gpt_model = OpenAIGPTLMHeadModel.from_pretrained(gpt_model_name) self.gpt_model.eval() - # print(self.gpt_model.config) + print(self.gpt_model.config) # Sanity check. assert len(self.vocab) == self.gpt_model.config.vocab_size diff --git a/lama/options.py b/lama/options.py index 0d771d9..e1c754f 100644 --- a/lama/options.py +++ b/lama/options.py @@ -41,7 +41,7 @@ def get_general_parser(): dest="max_sentence_length", type=int, default=100, - help="max sentence length", + help="max sentence lenght", ) __add_bert_args(parser) __add_elmo_args(parser) diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 702a161..2abf491 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -12,98 +12,98 @@ CASED_MODELS = [ -# { - # # "FAIRSEQ WIKI103" - # "lm": "fairseq", - # "data": "pre-trained_language_models/fairseq/wiki103_fconv_lm/", - # "fairseq_model_name": "wiki103.pt", - # "task": "language_modeling", - # "cpu": True, - # "output_dictionary_size": -1 - # }, - # { - # # "TransformerXL" - # "lm": "transformerxl", - # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", -# }, - { - # "ELMO ORIGINAL" - "lm": "elmo", - "elmo_model_dir": "pre-trained_language_models/elmo/original", - "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", - "elmo_vocab_name": "vocab-2016-09-10.txt", - "elmo_warm_up_cycles": 5 - }, - { - # "ELMO ORIGINAL 5.5B" - "lm": "elmo", - "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B", - "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", - "elmo_vocab_name": "vocab-enwiki-news-500000.txt", - "elmo_warm_up_cycles": 5 - }, - { - # "BERT BASE CASED" - "lm": "bert", - "bert_model_name": "bert-base-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", - "bert_vocab_name": "vocab.txt" - }, - { - # "BERT LARGE CASED" - "lm" : "bert", - "bert_model_name": "bert-large-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", - "bert_vocab_name": "vocab.txt" - }, - { - # "RoBERTa base" - "lm" : "roberta", - "roberta_model_name": "model.pt", - "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", - "roberta_vocab_name": "dict.txt", - "max_sentence_length": 100 - }, - { - # "hfRoBERTa base" - "lm" : "hfroberta", - "hfroberta_model_name": "roberta-base", - "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", - }, - { - # "OpenAI GPT-2" - "lm": "gpt2", - "gpt2_model_name": "gpt2", - "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", - }, + # { + # # "FAIRSEQ WIKI103" + # "lm": "fairseq", + # "data": "pre-trained_language_models/fairseq/wiki103_fconv_lm/", + # "fairseq_model_name": "wiki103.pt", + # "task": "language_modeling", + # "cpu": True, + # "output_dictionary_size": -1 + # }, + # { + # # "TransformerXL" + # "lm": "transformerxl", + # "transformerxl_model_dir": "pre-trained_language_models/transformerxl/transfo-xl-wt103/", + # }, + { + # "ELMO ORIGINAL" + "lm": "elmo", + "elmo_model_dir": "pre-trained_language_models/elmo/original", + "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway", + "elmo_vocab_name": "vocab-2016-09-10.txt", + "elmo_warm_up_cycles": 5 + }, + { + # "ELMO ORIGINAL 5.5B" + "lm": "elmo", + "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B", + "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", + "elmo_vocab_name": "vocab-enwiki-news-500000.txt", + "elmo_warm_up_cycles": 5 + }, + { + # "BERT BASE CASED" + "lm": "bert", + "bert_model_name": "bert-base-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + "bert_vocab_name": "vocab.txt" + }, + { + # "BERT LARGE CASED" + "lm" : "bert", + "bert_model_name": "bert-large-cased", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", + "bert_vocab_name": "vocab.txt" + }, + { + # "RoBERTa base" + "lm" : "roberta", + "roberta_model_name": "model.pt", + "roberta_model_dir": "pre-trained_language_models/roberta/roberta.base", + "roberta_vocab_name": "dict.txt", + "max_sentence_length": 100 + }, + { + # "hfRoBERTa base" + "lm" : "hfroberta", + "hfroberta_model_name": "roberta-base", + "hfroberta_model_dir": "pre-trained_language_models/roberta/roberta-base", + }, + { + # "OpenAI GPT-2" + "lm": "gpt2", + "gpt2_model_name": "gpt2", + "gpt2_model_dir": "pre-trained_language_models/gpt/gpt2", + }, ] CASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_cased.txt" LOWERCASED_MODELS = [ - { - # "BERT BASE UNCASED" - "lm": "bert", - "bert_model_name": "bert-base-uncased", - #"bert_model_dir": None, - "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12", - "bert_vocab_name": "vocab.txt" - }, - { - # "BERT LARGE UNCASED" - "lm": "bert", - "bert_model_name": "bert-large-uncased", - #"bert_model_dir": None, - "bert_model_dir": "pre-trained_language_models/bert/uncased_L-24_H-1024_A-16", - "bert_vocab_name": "vocab.txt" - }, - { - # "OpenAI GPT" - "lm": "gpt", - #"gpt_model_dir": None, - "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt", - "gpt_model_name": "openai-gpt" - }, + { + # "BERT BASE UNCASED" + "lm": "bert", + "bert_model_name": "bert-base-uncased", + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-12_H-768_A-12", + "bert_vocab_name": "vocab.txt" + }, + { + # "BERT LARGE UNCASED" + "lm": "bert", + "bert_model_name": "bert-large-uncased", + #"bert_model_dir": None, + "bert_model_dir": "pre-trained_language_models/bert/uncased_L-24_H-1024_A-16", + "bert_vocab_name": "vocab.txt" + }, + { + # "OpenAI GPT" + "lm": "gpt", + #"gpt_model_dir": None, + "gpt_model_dir": "pre-trained_language_models/gpt/openai-gpt", + "gpt_model_name": "openai-gpt" + }, ] LOWERCASED_COMMON_VOCAB_FILENAME = "pre-trained_language_models/common_vocab_lowercased.txt" diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index a7b5d02..708c11d 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -239,7 +239,6 @@ def filter_samples(model, samples, vocab_subset, max_sentence_length, template): if "obj_label" in sample and "sub_label" in sample: obj_label_ids = model.get_id(sample["obj_label"]) - # print(f'obj_label: {sample["obj_label"]} -> {obj_label_ids}') if obj_label_ids: recostructed_word = " ".join( @@ -248,8 +247,6 @@ def filter_samples(model, samples, vocab_subset, max_sentence_length, template): else: recostructed_word = None - # print(f'reconstructed_word: {recostructed_word}') - excluded = False if not template or len(template) == 0: masked_sentences = sample["masked_sentences"] diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index b2c10b8..5e48266 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -245,9 +245,7 @@ def run_all_LMs(parameters): for ip in LMs: print(ip["label"]) - use_negated_probes = False # vanilla LAMA - # use_negated_probes = True # Negated-LAMA - run_experiments(*parameters, input_param=ip, use_negated_probes=use_negated_probes) + run_experiments(*parameters, input_param=ip, use_negated_probes=False) if __name__ == "__main__": From 99a2590ff2a74cf384a656941000f2b05c3d0e89 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Tue, 17 Aug 2021 23:32:10 +0900 Subject: [PATCH 45/48] clean --- lama/vocab_intersection.py | 6 +++--- scripts/run_experiments.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lama/vocab_intersection.py b/lama/vocab_intersection.py index 2abf491..a838814 100644 --- a/lama/vocab_intersection.py +++ b/lama/vocab_intersection.py @@ -37,7 +37,7 @@ { # "ELMO ORIGINAL 5.5B" "lm": "elmo", - "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B", + "elmo_model_dir": "pre-trained_language_models/elmo/original5.5B/", "elmo_model_name": "elmo_2x4096_512_2048cnn_2xhighway_5.5B", "elmo_vocab_name": "vocab-enwiki-news-500000.txt", "elmo_warm_up_cycles": 5 @@ -46,14 +46,14 @@ # "BERT BASE CASED" "lm": "bert", "bert_model_name": "bert-base-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-12_H-768_A-12/", "bert_vocab_name": "vocab.txt" }, { # "BERT LARGE CASED" "lm" : "bert", "bert_model_name": "bert-large-cased", - "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16", + "bert_model_dir": "pre-trained_language_models/bert/cased_L-24_H-1024_A-16/", "bert_vocab_name": "vocab.txt" }, { diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 5e48266..8063e6c 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -111,7 +111,7 @@ def run_experiments( results_file = open("last_results.csv", "a+") results_file.write( - "=={}==\n".format(input_param["label"]) + "{},{}\n".format("lm_label", input_param["label"]) ) results_file.flush() @@ -244,7 +244,6 @@ def get_Squad_parameters(data_path_pre="data/"): def run_all_LMs(parameters): for ip in LMs: print(ip["label"]) - run_experiments(*parameters, input_param=ip, use_negated_probes=False) From 99ae7954c49c7939118a5f65606ea3abcc9d921b Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 18 Aug 2021 18:19:56 +0900 Subject: [PATCH 46/48] remove fix #30 and #31, tentatively --- scripts/batch_eval_KB_completion.py | 28 +++++----------------------- scripts/run_experiments.py | 6 ------ 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 708c11d..1b83d49 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -201,22 +201,12 @@ def lowercase_samples(samples, use_negated_probes=False): new_samples = [] for sample in samples: sample["obj_label"] = sample["obj_label"].lower() - try: - sample["sub_label"] = sample["sub_label"].lower() - except KeyError: # ConceptNet - None + sample["sub_label"] = sample["sub_label"].lower() lower_masked_sentences = [] - try: - for sentence in sample["masked_sentences"]: - sentence = sentence.lower() - sentence = sentence.replace(base.MASK.lower(), base.MASK) - lower_masked_sentences.append(sentence) - except KeyError: - for evidence in sample['evidences']: # TREx - sentence = evidence['masked_sentence'] - sentence = sentence.lower() - sentence = sentence.replace(base.MASK.lower(), base.MASK) - lower_masked_sentences.append(sentence) + for sentence in sample["masked_sentences"]: + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) sample["masked_sentences"] = lower_masked_sentences if "negated" in sample and use_negated_probes: @@ -399,14 +389,6 @@ def main(args, shuffle_data=True, model=None): else: # keep samples as they are all_samples = data - # TREx data - for i, sample in enumerate(all_samples): - if 'masked_sentences' not in sample.keys(): - sample['masked_sentences'] = [] - for evidence in sample['evidences']: - sample['masked_sentences'].append(evidence['masked_sentence']) - if i == 0: - print('not masked_sentences, but masked_sentence.') all_samples, ret_msg = filter_samples( model, data, vocab_subset, args.max_sentence_length, args.template diff --git a/scripts/run_experiments.py b/scripts/run_experiments.py index 8063e6c..b06c82f 100644 --- a/scripts/run_experiments.py +++ b/scripts/run_experiments.py @@ -154,12 +154,6 @@ def run_experiments( print("Exception: {}".format(e)) continue - # fix https://github.com/facebookresearch/LAMA/issues/30 - if input_param["lm"] in ["elmo"]: - if model is not None: - del model - model = None - if model is None: [model_type_name] = args.models_names model = build_model_by_name(model_type_name, args) From 7a3efee5a6569dcf1d861f373db65042b4605e8d Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Wed, 18 Aug 2021 18:23:38 +0900 Subject: [PATCH 47/48] remove fix #30 and #31, tentatively --- scripts/batch_eval_KB_completion.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 1b83d49..00bd785 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -650,15 +650,12 @@ def main(args, shuffle_data=True, model=None): pool.join() # stats - try: - # Mean reciprocal rank - MRR /= len(list_of_results) - - # Precision - Precision /= len(list_of_results) - Precision1 /= len(list_of_results) - except ZeroDivisionError: - MRR = Precision = Precision1 = 0.0 + # Mean reciprocal rank + MRR /= len(list_of_results) + + # Precision + Precision /= len(list_of_results) + Precision1 /= len(list_of_results) msg = "all_samples: {}\n".format(len(all_samples)) msg += "list_of_results: {}\n".format(len(list_of_results)) From cf70e4a265d8b1bd570bd4f8ca7b55d8c81a1f93 Mon Sep 17 00:00:00 2001 From: Yasuhiro Morioka Date: Fri, 20 Aug 2021 07:48:42 +0900 Subject: [PATCH 48/48] reactivate fix lowercase_samples() issue --- scripts/batch_eval_KB_completion.py | 43 +++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/scripts/batch_eval_KB_completion.py b/scripts/batch_eval_KB_completion.py index 00bd785..708c11d 100644 --- a/scripts/batch_eval_KB_completion.py +++ b/scripts/batch_eval_KB_completion.py @@ -201,12 +201,22 @@ def lowercase_samples(samples, use_negated_probes=False): new_samples = [] for sample in samples: sample["obj_label"] = sample["obj_label"].lower() - sample["sub_label"] = sample["sub_label"].lower() + try: + sample["sub_label"] = sample["sub_label"].lower() + except KeyError: # ConceptNet + None lower_masked_sentences = [] - for sentence in sample["masked_sentences"]: - sentence = sentence.lower() - sentence = sentence.replace(base.MASK.lower(), base.MASK) - lower_masked_sentences.append(sentence) + try: + for sentence in sample["masked_sentences"]: + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) + except KeyError: + for evidence in sample['evidences']: # TREx + sentence = evidence['masked_sentence'] + sentence = sentence.lower() + sentence = sentence.replace(base.MASK.lower(), base.MASK) + lower_masked_sentences.append(sentence) sample["masked_sentences"] = lower_masked_sentences if "negated" in sample and use_negated_probes: @@ -389,6 +399,14 @@ def main(args, shuffle_data=True, model=None): else: # keep samples as they are all_samples = data + # TREx data + for i, sample in enumerate(all_samples): + if 'masked_sentences' not in sample.keys(): + sample['masked_sentences'] = [] + for evidence in sample['evidences']: + sample['masked_sentences'].append(evidence['masked_sentence']) + if i == 0: + print('not masked_sentences, but masked_sentence.') all_samples, ret_msg = filter_samples( model, data, vocab_subset, args.max_sentence_length, args.template @@ -650,12 +668,15 @@ def main(args, shuffle_data=True, model=None): pool.join() # stats - # Mean reciprocal rank - MRR /= len(list_of_results) - - # Precision - Precision /= len(list_of_results) - Precision1 /= len(list_of_results) + try: + # Mean reciprocal rank + MRR /= len(list_of_results) + + # Precision + Precision /= len(list_of_results) + Precision1 /= len(list_of_results) + except ZeroDivisionError: + MRR = Precision = Precision1 = 0.0 msg = "all_samples: {}\n".format(len(all_samples)) msg += "list_of_results: {}\n".format(len(list_of_results))